LLVM 20.0.0git
SIISelLowering.cpp
Go to the documentation of this file.
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
46#include <optional>
47
48using namespace llvm;
49
50#define DEBUG_TYPE "si-lower"
51
52STATISTIC(NumTailCalls, "Number of tail calls");
53
55 "amdgpu-disable-loop-alignment",
56 cl::desc("Do not align and prefetch loops"),
57 cl::init(false));
58
60 "amdgpu-use-divergent-register-indexing",
62 cl::desc("Use indirect register addressing for divergent indexes"),
63 cl::init(false));
64
69
74
75static unsigned findFirstFreeSGPR(CCState &CCInfo) {
76 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
77 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
78 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
79 return AMDGPU::SGPR0 + Reg;
80 }
81 }
82 llvm_unreachable("Cannot allocate sgpr");
83}
84
86 const GCNSubtarget &STI)
87 : AMDGPUTargetLowering(TM, STI),
88 Subtarget(&STI) {
89 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
90 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
91
92 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
93 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
94
95 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
99
100 addRegisterClass(MVT::f64, V64RegClass);
101 addRegisterClass(MVT::v2f32, V64RegClass);
102 addRegisterClass(MVT::Untyped, V64RegClass);
103
104 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
105 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
106
107 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
108 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
109
110 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
111 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
112
113 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
114 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
115
116 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
117 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
118
119 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
120 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
121
122 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
123 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
124
125 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
126 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
127
128 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
129 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
130
131 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
132 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
133
134 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
135 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
136
137 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
138 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
139
140 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
141 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
142
143 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
144 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
145
146 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
147 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
148
149 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
150 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
151
152 if (Subtarget->has16BitInsts()) {
153 if (Subtarget->useRealTrue16Insts()) {
154 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
156 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
157 } else {
158 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
160 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
161 }
162
163 // Unless there are also VOP3P operations, not operations are really legal.
164 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
167 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
170 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
173 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
176 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
178 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
179 }
180
181 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
182 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
183
185
186 // The boolean content concept here is too inflexible. Compares only ever
187 // really produce a 1-bit result. Any copy/extend from these will turn into a
188 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
189 // it's what most targets use.
192
193 // We need to custom lower vector stores from local memory
194 setOperationAction(ISD::LOAD,
195 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
196 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
197 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
198 MVT::i1, MVT::v32i32},
199 Custom);
200
201 setOperationAction(ISD::STORE,
202 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
203 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
204 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
205 MVT::i1, MVT::v32i32},
206 Custom);
207
208 if (isTypeLegal(MVT::bf16)) {
209 for (unsigned Opc :
211 ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
212 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
213 ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
214 ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
215 ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
216 ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
217 ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
218 ISD::SETCC}) {
219 // FIXME: The promoted to type shouldn't need to be explicit
220 setOperationAction(Opc, MVT::bf16, Promote);
221 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
222 }
223
225
227 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
228
229 setOperationAction(ISD::FABS, MVT::bf16, Legal);
230 setOperationAction(ISD::FNEG, MVT::bf16, Legal);
232
233 // We only need to custom lower because we can't specify an action for bf16
234 // sources.
237 }
238
239 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
240 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
241 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
242 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
243 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
244 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
245 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
246 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
247 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
248 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
249 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
250 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
251 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
252 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
253 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
254 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
255
256 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
257 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
258 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
259 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
260 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
261 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
262 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
263
264 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
265
269 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
270
271 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
272
274 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
275
277 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
278 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
279
281 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
282 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
283 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
284 Expand);
286 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
287 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
288 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
289 Expand);
290
292 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
293 MVT::v3i16, MVT::v4i16, MVT::Other},
294 Custom);
295
296 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
297 setOperationAction(ISD::BR_CC,
298 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
299
301
303
305 Expand);
306
307#if 0
309#endif
310
311 // We only support LOAD/STORE and vector manipulation ops for vectors
312 // with > 4 elements.
313 for (MVT VT :
314 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
315 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
316 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
317 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
318 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
319 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
320 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
321 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
322 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
323 switch (Op) {
324 case ISD::LOAD:
325 case ISD::STORE:
327 case ISD::BITCAST:
328 case ISD::UNDEF:
332 case ISD::IS_FPCLASS:
333 break;
338 break;
339 default:
341 break;
342 }
343 }
344 }
345
346 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
347
348 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
349 // is expanded to avoid having two separate loops in case the index is a VGPR.
350
351 // Most operations are naturally 32-bit vector operations. We only support
352 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
353 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
355 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
356
358 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
359
361 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
362
364 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
365 }
366
367 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
369 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
370
372 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
373
375 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
376
378 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
379 }
380
381 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
383 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
384
386 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
387
389 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
390
392 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
393 }
394
395 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
397 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
398
400 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
401
403 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
404
406 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
407 }
408
409 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
411 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
412
414 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
415
417 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
418
420 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
421 }
422
424 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
425 Expand);
426
427 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
428 Custom);
429
430 // Avoid stack access for these.
431 // TODO: Generalize to more vector types.
433 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
434 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
435 Custom);
436
437 // Deal with vec3 vector operations when widened to vec4.
439 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
440
441 // Deal with vec5/6/7 vector operations when widened to vec8.
443 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
444 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
445 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
446 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
447 Custom);
448
449 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
450 // and output demarshalling
451 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
452
453 // We can't return success/failure, only the old value,
454 // let LLVM add the comparison
455 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
456 Expand);
457
458 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
459
460 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
461
462 // FIXME: This should be narrowed to i32, but that only happens if i64 is
463 // illegal.
464 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
465 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
466
467 // On SI this is s_memtime and s_memrealtime on VI.
468 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
469
470 if (Subtarget->hasSMemRealTime() ||
472 setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);
473 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
474
475 if (Subtarget->has16BitInsts()) {
476 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
477 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
478 } else {
479 setOperationAction(ISD::FSQRT, MVT::f16, Custom);
480 }
481
482 if (Subtarget->hasMadMacF32Insts())
484
485 if (!Subtarget->hasBFI())
486 // fcopysign can be done in a single instruction with BFI.
487 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
488
489 if (!Subtarget->hasBCNT(32))
491
492 if (!Subtarget->hasBCNT(64))
494
495 if (Subtarget->hasFFBH())
497
498 if (Subtarget->hasFFBL())
500
501 // We only really have 32-bit BFE instructions (and 16-bit on VI).
502 //
503 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
504 // effort to match them now. We want this to be false for i64 cases when the
505 // extraction isn't restricted to the upper or lower half. Ideally we would
506 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
507 // span the midpoint are probably relatively rare, so don't worry about them
508 // for now.
509 if (Subtarget->hasBFE())
511
512 // Clamp modifier on add/sub
513 if (Subtarget->hasIntClamp())
515
516 if (Subtarget->hasAddNoCarry())
517 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
518 Legal);
519
520 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
521 Custom);
522
523 // These are really only legal for ieee_mode functions. We should be avoiding
524 // them for functions that don't have ieee_mode enabled, so just say they are
525 // legal.
526 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
527 {MVT::f32, MVT::f64}, Legal);
528
529 if (Subtarget->haveRoundOpsF64())
530 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
531 Legal);
532 else
533 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
534 MVT::f64, Custom);
535
536 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
537 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
538 Legal);
539 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
540
541 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
543
544 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
545 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
546
547 // Custom lower these because we can't specify a rule based on an illegal
548 // source bf16.
549 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
550 setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);
551
552 if (Subtarget->has16BitInsts()) {
555 MVT::i16, Legal);
556
557 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
558
560 MVT::i16, Expand);
561
565 ISD::CTPOP},
566 MVT::i16, Promote);
567
568 setOperationAction(ISD::LOAD, MVT::i16, Custom);
569
570 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
571
572 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
573 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
574 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
575 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
576
580
582
583 // F16 - Constant Actions.
586
587 // F16 - Load/Store Actions.
588 setOperationAction(ISD::LOAD, MVT::f16, Promote);
589 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
590 setOperationAction(ISD::STORE, MVT::f16, Promote);
591 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
592
593 // BF16 - Load/Store Actions.
594 setOperationAction(ISD::LOAD, MVT::bf16, Promote);
595 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
596 setOperationAction(ISD::STORE, MVT::bf16, Promote);
597 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
598
599 // F16 - VOP1 Actions.
601 ISD::FSIN, ISD::FROUND},
602 MVT::f16, Custom);
603
606
607 // F16 - VOP2 Actions.
608 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
609 Expand);
610 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
611 setOperationAction(ISD::FFREXP, MVT::f16, Custom);
613
614 // F16 - VOP3 Actions.
616 if (STI.hasMadF16())
618
619 for (MVT VT :
620 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
621 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
622 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
623 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
624 switch (Op) {
625 case ISD::LOAD:
626 case ISD::STORE:
628 case ISD::BITCAST:
629 case ISD::UNDEF:
634 case ISD::IS_FPCLASS:
635 break;
639 break;
640 default:
642 break;
643 }
644 }
645 }
646
647 // v_perm_b32 can handle either of these.
648 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
650
651 // XXX - Do these do anything? Vector constants turn into build_vector.
652 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
653
654 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
655 Legal);
656
657 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
658 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
659 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
660 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
661
662 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
663 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
664 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
665 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
666
667 setOperationAction(ISD::AND, MVT::v2i16, Promote);
668 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
669 setOperationAction(ISD::OR, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
671 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
672 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
673
674 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
675 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
676 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
677 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
678 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
679 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
680
681 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
682 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
683 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
684 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
685 setOperationAction(ISD::STORE, MVT::v4bf16, Promote);
686 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
687
688 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
689 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
690 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
691 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
692 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
693 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
694
695 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
696 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
697 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
698 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
699
700 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
701 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
702 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
703 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
704 setOperationAction(ISD::STORE, MVT::v8bf16, Promote);
705 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
706
707 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
708 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
709 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
711 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
713
714 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
715 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
716 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
717 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
718 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
719 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
720
721 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
722 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
723 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
725 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
726 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
727
728 setOperationAction(ISD::STORE, MVT::v32i16, Promote);
729 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
730 setOperationAction(ISD::STORE, MVT::v32f16, Promote);
731 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
732 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
733 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
734
736 MVT::v2i32, Expand);
737 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
738
740 MVT::v4i32, Expand);
741
743 MVT::v8i32, Expand);
744
745 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
746 Subtarget->hasVOP3PInsts() ? Legal : Custom);
747
748 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
749 // This isn't really legal, but this avoids the legalizer unrolling it (and
750 // allows matching fneg (fabs x) patterns)
751 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
752
753 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
754 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
755
756 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::FMINIMUMNUM,
757 ISD::FMAXIMUMNUM},
758 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
759 Custom);
760
761 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
762 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
763 Expand);
764
765 for (MVT Vec16 :
766 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
767 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
770 Vec16, Custom);
772 }
773 }
774
775 if (Subtarget->hasVOP3PInsts()) {
779 MVT::v2i16, Legal);
780
781 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
782 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
783 MVT::v2f16, Legal);
784
785 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
786 Custom);
787
789 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
790 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
791 Custom);
792
793 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
794 // Split vector operations.
799 VT, Custom);
800
801 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
802 // Split vector operations.
804 VT, Custom);
805
806 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
807 Custom);
808
809 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
810 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
811 Custom);
812
813 if (Subtarget->hasPackedFP32Ops()) {
815 MVT::v2f32, Legal);
817 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
818 Custom);
819 }
820 }
821
822 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
823
824 if (Subtarget->has16BitInsts()) {
826 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
828 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
829 } else {
830 // Legalization hack.
831 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
832
833 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
834 }
835
837 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
838 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
839 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
840 MVT::v32f16, MVT::v32bf16},
841 Custom);
842
844
845 if (Subtarget->hasScalarSMulU64())
847
848 if (Subtarget->hasMad64_32())
850
851 if (Subtarget->hasPrefetch())
852 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
853
854 if (Subtarget->hasIEEEMinMax()) {
855 setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
856 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
857 setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
858 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
859 Custom);
860 }
861
863 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
864 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
865 MVT::i8},
866 Custom);
867
869 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
870 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
871 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
872 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
873 Custom);
874
876 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
877 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
878 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
879 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
880 Custom);
881
882 setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
884 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
885 setOperationAction(ISD::GET_FPENV, MVT::i64, Custom);
886 setOperationAction(ISD::SET_FPENV, MVT::i64, Custom);
887
888 // TODO: Could move this to custom lowering, could benefit from combines on
889 // extract of relevant bits.
890 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
891
893
896 ISD::SUB,
898 ISD::MUL,
899 ISD::FADD,
900 ISD::FSUB,
901 ISD::FDIV,
902 ISD::FMINNUM,
903 ISD::FMAXNUM,
904 ISD::FMINNUM_IEEE,
905 ISD::FMAXNUM_IEEE,
906 ISD::FMINIMUM,
907 ISD::FMAXIMUM,
908 ISD::FMA,
909 ISD::SMIN,
910 ISD::SMAX,
911 ISD::UMIN,
912 ISD::UMAX,
915 ISD::SMIN,
916 ISD::SMAX,
917 ISD::UMIN,
918 ISD::UMAX,
919 ISD::AND,
920 ISD::OR,
921 ISD::XOR,
922 ISD::SHL,
923 ISD::SRL,
924 ISD::SRA,
925 ISD::FSHR,
935
936 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
938
939 // All memory operations. Some folding on the pointer operand is done to help
940 // matching the constant offsets in the addressing modes.
941 setTargetDAGCombine({ISD::LOAD,
942 ISD::STORE,
943 ISD::ATOMIC_LOAD,
944 ISD::ATOMIC_STORE,
945 ISD::ATOMIC_CMP_SWAP,
946 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
947 ISD::ATOMIC_SWAP,
948 ISD::ATOMIC_LOAD_ADD,
949 ISD::ATOMIC_LOAD_SUB,
950 ISD::ATOMIC_LOAD_AND,
951 ISD::ATOMIC_LOAD_OR,
952 ISD::ATOMIC_LOAD_XOR,
953 ISD::ATOMIC_LOAD_NAND,
954 ISD::ATOMIC_LOAD_MIN,
955 ISD::ATOMIC_LOAD_MAX,
956 ISD::ATOMIC_LOAD_UMIN,
957 ISD::ATOMIC_LOAD_UMAX,
958 ISD::ATOMIC_LOAD_FADD,
959 ISD::ATOMIC_LOAD_FMIN,
960 ISD::ATOMIC_LOAD_FMAX,
961 ISD::ATOMIC_LOAD_UINC_WRAP,
962 ISD::ATOMIC_LOAD_UDEC_WRAP,
965
966 // FIXME: In other contexts we pretend this is a per-function property.
968
970}
971
973 return Subtarget;
974}
975
977 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
978 return RCRegs;
979}
980
981//===----------------------------------------------------------------------===//
982// TargetLowering queries
983//===----------------------------------------------------------------------===//
984
985// v_mad_mix* support a conversion from f16 to f32.
986//
987// There is only one special case when denormals are enabled we don't currently,
988// where this is OK to use.
989bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
990 EVT DestVT, EVT SrcVT) const {
991 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
992 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
993 DestVT.getScalarType() == MVT::f32 &&
994 SrcVT.getScalarType() == MVT::f16 &&
995 // TODO: This probably only requires no input flushing?
997}
998
1000 LLT DestTy, LLT SrcTy) const {
1001 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1002 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1003 DestTy.getScalarSizeInBits() == 32 &&
1004 SrcTy.getScalarSizeInBits() == 16 &&
1005 // TODO: This probably only requires no input flushing?
1006 denormalModeIsFlushAllF32(*MI.getMF());
1007}
1008
1010 // SI has some legal vector types, but no legal vector operations. Say no
1011 // shuffles are legal in order to prefer scalarizing some vector operations.
1012 return false;
1013}
1014
1016 CallingConv::ID CC,
1017 EVT VT) const {
1020
1021 if (VT.isVector()) {
1022 EVT ScalarVT = VT.getScalarType();
1023 unsigned Size = ScalarVT.getSizeInBits();
1024 if (Size == 16) {
1025 if (Subtarget->has16BitInsts()) {
1026 if (VT.isInteger())
1027 return MVT::v2i16;
1028 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1029 }
1030 return VT.isInteger() ? MVT::i32 : MVT::f32;
1031 }
1032
1033 if (Size < 16)
1034 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1035 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1036 }
1037
1038 if (VT.getSizeInBits() > 32)
1039 return MVT::i32;
1040
1042}
1043
// Computes how many registers are needed to pass/return VT under calling
// convention CC, mirroring getRegisterTypeForCallingConv above: pairs of
// 16-bit elements share one register; >32-bit elements take one register per
// 32-bit chunk.
// NOTE(review): the signature line (1044) and lines 1047-1048 (most likely the
// non-vector/kernel delegation to the base class) are elided in this listing.
1045 CallingConv::ID CC,
1046 EVT VT) const {
1049
1050 if (VT.isVector()) {
1051 unsigned NumElts = VT.getVectorNumElements();
1052 EVT ScalarVT = VT.getScalarType();
1053 unsigned Size = ScalarVT.getSizeInBits();
1054
1055 // FIXME: Should probably promote 8-bit vectors to i16.
1056 if (Size == 16 && Subtarget->has16BitInsts())
1057 return (NumElts + 1) / 2;
1058
1059 if (Size <= 32)
1060 return NumElts;
1061
1062 if (Size > 32)
1063 return NumElts * ((Size + 31) / 32);
1064 } else if (VT.getSizeInBits() > 32)
1065 return (VT.getSizeInBits() + 31) / 32;
1066
// NOTE(review): line 1067, the final fallthrough return (presumably the base
// class implementation), is elided in this listing.
1068}
1069
// Breaks a vector VT into (IntermediateVT x NumIntermediates) pieces, each
// carried in RegisterVT, for non-kernel calling conventions; returns the
// number of registers used. Kernel CC and non-vector types fall through to
// the base implementation at the bottom.
// NOTE(review): the signature line (1070) and the opening of the final return
// (1124) are elided in this listing.
1071 LLVMContext &Context, CallingConv::ID CC,
1072 EVT VT, EVT &IntermediateVT,
1073 unsigned &NumIntermediates, MVT &RegisterVT) const {
1074 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1075 unsigned NumElts = VT.getVectorNumElements();
1076 EVT ScalarVT = VT.getScalarType();
1077 unsigned Size = ScalarVT.getSizeInBits();
1078 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1079 // support, but unless we can properly handle 3-vectors, it will be still be
1080 // inconsistent.
1081 if (Size == 16 && Subtarget->has16BitInsts()) {
// Pack pairs of 16-bit elements; bf16 pairs are carried in an i32
// register since there is no packed bf16 register type here.
1082 if (ScalarVT == MVT::bf16) {
1083 RegisterVT = MVT::i32;
1084 IntermediateVT = MVT::v2bf16;
1085 } else {
1086 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1087 IntermediateVT = RegisterVT;
1088 }
1089 NumIntermediates = (NumElts + 1) / 2;
1090 return NumIntermediates;
1091 }
1092
1093 if (Size == 32) {
1094 RegisterVT = ScalarVT.getSimpleVT();
1095 IntermediateVT = RegisterVT;
1096 NumIntermediates = NumElts;
1097 return NumIntermediates;
1098 }
1099
1100 if (Size < 16 && Subtarget->has16BitInsts()) {
1101 // FIXME: Should probably form v2i16 pieces
1102 RegisterVT = MVT::i16;
1103 IntermediateVT = ScalarVT;
1104 NumIntermediates = NumElts;
1105 return NumIntermediates;
1106 }
1107
1108
1109 if (Size != 16 && Size <= 32) {
1110 RegisterVT = MVT::i32;
1111 IntermediateVT = ScalarVT;
1112 NumIntermediates = NumElts;
1113 return NumIntermediates;
1114 }
1115
1116 if (Size > 32) {
// Wide elements are split into one i32 register per 32-bit chunk.
1117 RegisterVT = MVT::i32;
1118 IntermediateVT = RegisterVT;
1119 NumIntermediates = NumElts * ((Size + 31) / 32);
1120 return NumIntermediates;
1121 }
1122 }
1123
1125 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1126}
1127
// Computes the in-memory EVT for a load intrinsic's data type, clamping a
// fixed vector's element count to MaxNumLanes (the real number of lanes
// loaded, e.g. from a dmask). Non-vector types map straight through
// getValueType.
// NOTE(review): the signature line (1128, a static helper taking
// 'const SITargetLowering &TLI') is elided in this listing.
1130 const DataLayout &DL, Type *Ty,
1131 unsigned MaxNumLanes) {
1132 assert(MaxNumLanes != 0);
1133
1134 LLVMContext &Ctx = Ty->getContext();
1135 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1136 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1137 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1138 NumElts);
1139 }
1140
1141 return TLI.getValueType(DL, Ty);
1142}
1142
1143// Peek through TFE struct returns to only use the data size.
// If Ty is the {data, i32} aggregate returned by a TFE intrinsic, forward
// only the data member to memVTFromLoadIntrData; otherwise forward Ty itself.
// NOTE(review): the signature line (1144) is elided in this listing.
1145 const DataLayout &DL, Type *Ty,
1146 unsigned MaxNumLanes) {
1147 auto *ST = dyn_cast<StructType>(Ty);
1148 if (!ST)
1149 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1150
1151 // TFE intrinsics return an aggregate type.
1152 assert(ST->getNumContainedTypes() == 2 &&
1153 ST->getContainedType(1)->isIntegerTy(32));
1154 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1155}
1156
1157/// Map address space 7 to MVT::v5i32 because that's its in-memory
1158/// representation. This return value is vector-typed because there is no
1159/// MVT::i160 and it is not clear if one can be added. While this could
1160/// cause issues during codegen, these address space 7 pointers will be
1161/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1162/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1163/// modeling, to work.
// NOTE(review): the signature line (1164) and lines 1167/1170 are elided in
// this listing; 1167 evidently tests a second address space (its body returns
// v6i32 for a 192-bit pointer) and 1170 is the fallthrough to the base-class
// getPointerTy. Confirm against upstream.
1165 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1166 return MVT::v5i32;
1168 DL.getPointerSizeInBits(AS) == 192)
1169 return MVT::v6i32;
1171}
1172/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1173/// v8i32 when padding is added.
1174/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1175/// also v8i32 with padding.
// Both padded pointer representations occupy 8 dwords in memory, hence v8i32.
// NOTE(review): the signature line (1176), the second address-space test
// (1179), and the fallthrough return (1182) are elided in this listing.
1177 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1178 DL.getPointerSizeInBits(AS) == 160) ||
1180 DL.getPointerSizeInBits(AS) == 192))
1181 return MVT::v8i32;
1183}
1184
// Fills in IntrinsicInfo (opcode, memVT, pointer, alignment, MMO flags) for
// AMDGPU memory intrinsics so the DAG builder can attach machine memory
// operands. Handles the table-driven rsrc (buffer/image) intrinsics first,
// then a switch over the remaining special cases; returns false for
// intrinsics that do not touch memory.
// NOTE(review): this doxygen-derived listing elides a number of hyperlinked
// lines (the signature line 1185 and e.g. 1194-1195, 1206-1207, 1230, 1247,
// 1270-1272, 1302, 1319, 1333, 1342, 1356-1357, 1367, 1385-1387, 1410, 1429,
// 1438, 1446); compare against upstream before modifying.
1186 const CallInst &CI,
1187 MachineFunction &MF,
1188 unsigned IntrID) const {
1189 Info.flags = MachineMemOperand::MONone;
1190 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1191 Info.flags |= MachineMemOperand::MOInvariant;
1192
// Table-driven handling for buffer/image resource intrinsics.
1193 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1196 (Intrinsic::ID)IntrID);
1197 MemoryEffects ME = Attr.getMemoryEffects();
1198 if (ME.doesNotAccessMemory())
1199 return false;
1200
1201 // TODO: Should images get their own address space?
1202 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1203
1204 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1205 if (RsrcIntr->IsImage) {
1208 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1209 Info.align.reset();
1210 }
1211
1212 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1213 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1214 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1215 // We conservatively set the memory operand of a buffer intrinsic to the
1216 // base resource pointer, so that we can access alias information about
1217 // those pointers. Cases like "this points at the same value
1218 // but with a different offset" are handled in
1219 // areMemAccessesTriviallyDisjoint.
1220 Info.ptrVal = RsrcArg;
1221 }
1222
// The cache-policy (aux) operand is always last; honor its VOLATILE bit.
// s_buffer_prefetch_data has no such operand.
1223 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1224 if (!IsSPrefetch) {
1225 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1226 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1227 Info.flags |= MachineMemOperand::MOVolatile;
1228 }
1229
1231 if (ME.onlyReadsMemory()) {
1232 if (RsrcIntr->IsImage) {
1233 unsigned MaxNumLanes = 4;
1234
1235 if (!BaseOpcode->Gather4) {
1236 // If this isn't a gather, we may have excess loaded elements in the
1237 // IR type. Check the dmask for the real number of elements loaded.
1238 unsigned DMask
1239 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1240 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1241 }
1242
1243 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1244 CI.getType(), MaxNumLanes);
1245 } else {
1246 Info.memVT =
1248 std::numeric_limits<unsigned>::max());
1249 }
1250
1251 // FIXME: What does alignment mean for an image?
1252 Info.opc = ISD::INTRINSIC_W_CHAIN;
1253 Info.flags |= MachineMemOperand::MOLoad;
1254 } else if (ME.onlyWritesMemory()) {
1255 Info.opc = ISD::INTRINSIC_VOID;
1256
1257 Type *DataTy = CI.getArgOperand(0)->getType();
1258 if (RsrcIntr->IsImage) {
1259 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1260 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1261 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1262 DMaskLanes);
1263 } else
1264 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1265
1266 Info.flags |= MachineMemOperand::MOStore;
1267 } else {
1268 // Atomic, NoReturn Sampler or prefetch
1269 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1271 Info.flags |=
1273
1274 if (!IsSPrefetch)
1275 Info.flags |= MachineMemOperand::MOStore;
1276
1277 switch (IntrID) {
1278 default:
1279 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1280 // Fake memory access type for no return sampler intrinsics
1281 Info.memVT = MVT::i32;
1282 } else {
1283 // XXX - Should this be volatile without known ordering?
1284 Info.flags |= MachineMemOperand::MOVolatile;
1285 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1286 }
1287 break;
1288 case Intrinsic::amdgcn_raw_buffer_load_lds:
1289 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1290 case Intrinsic::amdgcn_struct_buffer_load_lds:
1291 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
// Width operand is in bytes; memVT is the matching integer type.
1292 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1293 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1294 Info.ptrVal = CI.getArgOperand(1);
1295 return true;
1296 }
1297 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1298 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1299 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1300 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1301 Info.memVT =
1303 std::numeric_limits<unsigned>::max());
// Atomic buffer loads read but never write.
1304 Info.flags &= ~MachineMemOperand::MOStore;
1305 return true;
1306 }
1307 }
1308 }
1309 return true;
1310 }
1311
// Non-rsrc special cases.
1312 switch (IntrID) {
1313 case Intrinsic::amdgcn_ds_ordered_add:
1314 case Intrinsic::amdgcn_ds_ordered_swap: {
1315 Info.opc = ISD::INTRINSIC_W_CHAIN;
1316 Info.memVT = MVT::getVT(CI.getType());
1317 Info.ptrVal = CI.getOperand(0);
1318 Info.align.reset();
1320
1321 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1322 if (!Vol->isZero())
1323 Info.flags |= MachineMemOperand::MOVolatile;
1324
1325 return true;
1326 }
1327 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1328 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1329 Info.opc = ISD::INTRINSIC_W_CHAIN;
1330 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1331 Info.ptrVal = nullptr;
1332 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1334 return true;
1335 }
1336 case Intrinsic::amdgcn_ds_append:
1337 case Intrinsic::amdgcn_ds_consume: {
1338 Info.opc = ISD::INTRINSIC_W_CHAIN;
1339 Info.memVT = MVT::getVT(CI.getType());
1340 Info.ptrVal = CI.getOperand(0);
1341 Info.align.reset();
1343
1344 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1345 if (!Vol->isZero())
1346 Info.flags |= MachineMemOperand::MOVolatile;
1347
1348 return true;
1349 }
1350 case Intrinsic::amdgcn_global_atomic_csub: {
1351 Info.opc = ISD::INTRINSIC_W_CHAIN;
1352 Info.memVT = MVT::getVT(CI.getType());
1353 Info.ptrVal = CI.getOperand(0);
1354 Info.align.reset();
1355 Info.flags |= MachineMemOperand::MOLoad |
1358 return true;
1359 }
1360 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1361 Info.opc = ISD::INTRINSIC_W_CHAIN;
1362 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1363
1364 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1365 Info.align.reset();
1366 Info.flags |= MachineMemOperand::MOLoad |
1368 return true;
1369 }
1370 case Intrinsic::amdgcn_global_atomic_fmin:
1371 case Intrinsic::amdgcn_global_atomic_fmax:
1372 case Intrinsic::amdgcn_global_atomic_fmin_num:
1373 case Intrinsic::amdgcn_global_atomic_fmax_num:
1374 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1375 case Intrinsic::amdgcn_flat_atomic_fmin:
1376 case Intrinsic::amdgcn_flat_atomic_fmax:
1377 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1378 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1379 case Intrinsic::amdgcn_atomic_cond_sub_u32: {
1380 Info.opc = ISD::INTRINSIC_W_CHAIN;
1381 Info.memVT = MVT::getVT(CI.getType());
1382 Info.ptrVal = CI.getOperand(0);
1383 Info.align.reset();
1384 Info.flags |= MachineMemOperand::MOLoad |
1388 return true;
1389 }
1390 case Intrinsic::amdgcn_global_load_tr_b64:
1391 case Intrinsic::amdgcn_global_load_tr_b128: {
1392 Info.opc = ISD::INTRINSIC_W_CHAIN;
1393 Info.memVT = MVT::getVT(CI.getType());
1394 Info.ptrVal = CI.getOperand(0);
1395 Info.align.reset();
1396 Info.flags |= MachineMemOperand::MOLoad;
1397 return true;
1398 }
1399 case Intrinsic::amdgcn_ds_gws_init:
1400 case Intrinsic::amdgcn_ds_gws_barrier:
1401 case Intrinsic::amdgcn_ds_gws_sema_v:
1402 case Intrinsic::amdgcn_ds_gws_sema_br:
1403 case Intrinsic::amdgcn_ds_gws_sema_p:
1404 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1405 Info.opc = ISD::INTRINSIC_VOID;
1406
1407 const GCNTargetMachine &TM =
1408 static_cast<const GCNTargetMachine &>(getTargetMachine());
1409
1411 Info.ptrVal = MFI->getGWSPSV(TM);
1412
1413 // This is an abstract access, but we need to specify a type and size.
1414 Info.memVT = MVT::i32;
1415 Info.size = 4;
1416 Info.align = Align(4);
1417
1418 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1419 Info.flags |= MachineMemOperand::MOLoad;
1420 else
1421 Info.flags |= MachineMemOperand::MOStore;
1422 return true;
1423 }
1424 case Intrinsic::amdgcn_global_load_lds: {
1425 Info.opc = ISD::INTRINSIC_VOID;
1426 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1427 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1428 Info.ptrVal = CI.getArgOperand(1);
1430 return true;
1431 }
1432 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1433 Info.opc = ISD::INTRINSIC_W_CHAIN;
1434
1435 const GCNTargetMachine &TM =
1436 static_cast<const GCNTargetMachine &>(getTargetMachine());
1437
1439 Info.ptrVal = MFI->getGWSPSV(TM);
1440
1441 // This is an abstract access, but we need to specify a type and size.
1442 Info.memVT = MVT::i32;
1443 Info.size = 4;
1444 Info.align = Align(4);
1445
1447 return true;
1448 }
1449 case Intrinsic::amdgcn_s_prefetch_data: {
1450 Info.opc = ISD::INTRINSIC_VOID;
1451 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1452 Info.ptrVal = CI.getArgOperand(0);
1453 Info.flags |= MachineMemOperand::MOLoad;
1454 return true;
1455 }
1456 default:
1457 return false;
1458 }
1459}
1460
// Appends extra SDAG operands for target intrinsics. For
// amdgcn.addrspacecast.nonnull it records the source and destination address
// spaces as constant operands, since the DAG's value types do not carry
// address-space information.
// NOTE(review): the signature line (1461) and line 1463 (presumably the
// switch on the intrinsic ID) are elided in this listing.
1462 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1464 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1465 // The DAG's ValueType loses the addrspaces.
1466 // Add them as 2 extra Constant operands "from" and "to".
1467 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1468 unsigned DstAS = I.getType()->getPointerAddressSpace();
1469 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1470 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1471 break;
1472 }
1473 default:
1474 break;
1475 }
1476}
1477
// For memory intrinsics, reports the pointer operand (into Ops) and the
// access type so addressing-mode analysis (e.g. LSR) can reason about them;
// returns false for intrinsics it does not recognize.
// NOTE(review): the signature lines (1478-1479) are elided in this listing.
1480 Type *&AccessTy) const {
1481 Value *Ptr = nullptr;
1482 switch (II->getIntrinsicID()) {
1483 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1484 case Intrinsic::amdgcn_ds_append:
1485 case Intrinsic::amdgcn_ds_consume:
1486 case Intrinsic::amdgcn_ds_ordered_add:
1487 case Intrinsic::amdgcn_ds_ordered_swap:
1488 case Intrinsic::amdgcn_flat_atomic_fmax:
1489 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1490 case Intrinsic::amdgcn_flat_atomic_fmin:
1491 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1492 case Intrinsic::amdgcn_global_atomic_csub:
1493 case Intrinsic::amdgcn_global_atomic_fmax:
1494 case Intrinsic::amdgcn_global_atomic_fmax_num:
1495 case Intrinsic::amdgcn_global_atomic_fmin:
1496 case Intrinsic::amdgcn_global_atomic_fmin_num:
1497 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1498 case Intrinsic::amdgcn_global_load_tr_b64:
1499 case Intrinsic::amdgcn_global_load_tr_b128:
1500 Ptr = II->getArgOperand(0);
1501 break;
1502 case Intrinsic::amdgcn_global_load_lds:
// For global.load.lds the global pointer is the second operand.
1503 Ptr = II->getArgOperand(1);
1504 break;
1505 default:
1506 return false;
1507 }
1508 AccessTy = II->getType();
1509 Ops.push_back(Ptr);
1510 return true;
1511}
1512
// Legality of an addressing mode for FLAT-class instructions: never a scaled
// index; an immediate offset only when the subtarget supports flat offsets
// and SIInstrInfo deems it legal for the chosen flat variant.
// NOTE(review): the signature line (1513) and lines 1522-1524 (the selection
// of the flat instruction variant from the address space) are elided in this
// listing.
1514 unsigned AddrSpace) const {
1515 if (!Subtarget->hasFlatInstOffsets()) {
1516 // Flat instructions do not have offsets, and only have the register
1517 // address.
1518 return AM.BaseOffs == 0 && AM.Scale == 0;
1519 }
1520
1521 decltype(SIInstrFlags::FLAT) FlatVariant =
1525
1526 return AM.Scale == 0 &&
1527 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1528 AM.BaseOffs, AddrSpace, FlatVariant));
1529}
1530
// Legality of an addressing mode for global-memory accesses: defer to the
// flat rules when global flat instructions exist, otherwise fall back to the
// MUBUF rules (with a conservative restriction on VI, see below).
// NOTE(review): the signature line (1531) and the return expressions on lines
// 1533/1545 are elided in this listing.
1532 if (Subtarget->hasFlatGlobalInsts())
1534
1535 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1536 // Assume the we will use FLAT for all global memory accesses
1537 // on VI.
1538 // FIXME: This assumption is currently wrong. On VI we still use
1539 // MUBUF instructions for the r + i addressing mode. As currently
1540 // implemented, the MUBUF instructions only work on buffer < 4GB.
1541 // It may be possible to support > 4GB buffers with MUBUF instructions,
1542 // by setting the stride value in the resource descriptor which would
1543 // increase the size limit to (stride * 4GB). However, this is risky,
1544 // because it has never been validated.
1546 }
1547
1548 return isLegalMUBUFAddressingMode(AM);
1549}
1550
// Returns whether AM can be encoded by a MUBUF/MTBUF access: the immediate
// must fit the buffer-offset field, and only scales 0, 1, and 2 (the last
// without a second base register) are representable.
1551bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1552 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1553 // additionally can do r + r + i with addr64. 32-bit has more addressing
1554 // mode options. Depending on the resource constant, it can also do
1555 // (i64 r0) + (i32 r1) * (i14 i).
1556 //
1557 // Private arrays end up using a scratch buffer most of the time, so also
1558 // assume those use MUBUF instructions. Scratch loads / stores are currently
1559 // implemented as mubuf instructions with offen bit set, so slightly
1560 // different than the normal addr64.
1561 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1562 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1563 return false;
1564
1565 // FIXME: Since we can split immediate into soffset and immediate offset,
1566 // would it make sense to allow any immediate?
1567
1568 switch (AM.Scale) {
1569 case 0: // r + i or just i, depending on HasBaseReg.
1570 return true;
1571 case 1:
1572 return true; // We have r + r or r + i.
1573 case 2:
1574 if (AM.HasBaseReg) {
1575 // Reject 2 * r + r.
1576 return false;
1577 }
1578
1579 // Allow 2 * r as r + r
1580 // Or 2 * r + i is allowed as r + r + i.
1581 return true;
1582 default: // Don't allow n * r
1583 return false;
1584 }
1585}
1586
// Top-level TargetLowering hook: decides whether addressing mode AM is legal
// for an access of type Ty in address space AS, by dispatching to the
// global/SMRD/MUBUF/flat/DS rules per address space.
// NOTE(review): this listing elides the signature line (1587) and several
// interior lines (1598-1600, 1616, 1641, 1661, 1682, 1688 — additional
// address-space tests and returns); compare against upstream before editing.
1588 const AddrMode &AM, Type *Ty,
1589 unsigned AS, Instruction *I) const {
1590 // No global is ever allowed as a base.
1591 if (AM.BaseGV)
1592 return false;
1593
1594 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1595 return isLegalGlobalAddressingMode(AM);
1596
// Scalar (SMRD) path for constant address spaces; the offset-field width
// depends on the subtarget generation.
1597 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1601 // If the offset isn't a multiple of 4, it probably isn't going to be
1602 // correctly aligned.
1603 // FIXME: Can we get the real alignment here?
1604 if (AM.BaseOffs % 4 != 0)
1605 return isLegalMUBUFAddressingMode(AM);
1606
1607 if (!Subtarget->hasScalarSubwordLoads()) {
1608 // There are no SMRD extloads, so if we have to do a small type access we
1609 // will use a MUBUF load.
1610 // FIXME?: We also need to do this if unaligned, but we don't know the
1611 // alignment here.
1612 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1613 return isLegalGlobalAddressingMode(AM);
1614 }
1615
1617 // SMRD instructions have an 8-bit, dword offset on SI.
1618 if (!isUInt<8>(AM.BaseOffs / 4))
1619 return false;
1620 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1621 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1622 // in 8-bits, it can use a smaller encoding.
1623 if (!isUInt<32>(AM.BaseOffs / 4))
1624 return false;
1625 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1626 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1627 if (!isUInt<20>(AM.BaseOffs))
1628 return false;
1629 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1630 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1631 // for S_BUFFER_* instructions).
1632 if (!isInt<21>(AM.BaseOffs))
1633 return false;
1634 } else {
1635 // On GFX12, all offsets are signed 24-bit in bytes.
1636 if (!isInt<24>(AM.BaseOffs))
1637 return false;
1638 }
1639
1640 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
1642 AM.BaseOffs < 0) {
1643 // Scalar (non-buffer) loads can only use a negative offset if
1644 // soffset+offset is non-negative. Since the compiler can only prove that
1645 // in a few special cases, it is safer to claim that negative offsets are
1646 // not supported.
1647 return false;
1648 }
1649
1650 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1651 return true;
1652
1653 if (AM.Scale == 1 && AM.HasBaseReg)
1654 return true;
1655
1656 return false;
1657 }
1658
1659 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1660 return Subtarget->enableFlatScratch()
1662 : isLegalMUBUFAddressingMode(AM);
1663
1664 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1665 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1666 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1667 // field.
1668 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1669 // an 8-bit dword offset but we don't know the alignment here.
1670 if (!isUInt<16>(AM.BaseOffs))
1671 return false;
1672
1673 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1674 return true;
1675
1676 if (AM.Scale == 1 && AM.HasBaseReg)
1677 return true;
1678
1679 return false;
1680 }
1681
1683 // For an unknown address space, this usually means that this is for some
1684 // reason being used for pure arithmetic, and not based on some addressing
1685 // computation. We don't have instructions that compute pointers with any
1686 // addressing modes, so treat them as having no offset like flat
1687 // instructions.
1689 }
1690
1691 // Assume a user alias of global for unknown address spaces.
1692 return isLegalGlobalAddressingMode(AM);
1693}
1694
// Limits merged-store width per address space: 4 dwords for one class of
// address spaces (the test on line 1697 is elided here), the subtarget's
// max private element size for scratch, and 2 dwords for the class tested on
// the elided line 1703; everything else is unrestricted.
// NOTE(review): the signature line (1695) and the address-space conditions on
// lines 1697/1703 are elided in this listing.
1696 const MachineFunction &MF) const {
1698 return (MemVT.getSizeInBits() <= 4 * 32);
1699 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1700 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1701 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1702 }
1704 return (MemVT.getSizeInBits() <= 2 * 32);
1705 return true;
1706}
1707
// Core misaligned-access policy. Decides whether a Size-bit access in
// AddrSpace with the given Alignment is allowed, and (optionally, via IsFast)
// reports a relative "speed rank" used to compare lowering strategies.
// Handles LDS/GDS (with the ds_read2/write2 4- or 8-byte-aligned tricks),
// scratch, flat, and global address spaces in turn.
// NOTE(review): the signature line (1708) and line 1864 (the second operand
// of the final global-memory return) are elided in this listing.
1709 unsigned Size, unsigned AddrSpace, Align Alignment,
1710 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1711 if (IsFast)
1712 *IsFast = 0;
1713
1714 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1715 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1716 // Check if alignment requirements for ds_read/write instructions are
1717 // disabled.
1718 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1719 return false;
1720
1721 Align RequiredAlignment(
1722 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
1723 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1724 Alignment < RequiredAlignment)
1725 return false;
1726
1727 // Either, the alignment requirements are "enabled", or there is an
1728 // unaligned LDS access related hardware bug though alignment requirements
1729 // are "disabled". In either case, we need to check for proper alignment
1730 // requirements.
1731 //
1732 switch (Size) {
1733 case 64:
1734 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1735 // address is negative, then the instruction is incorrectly treated as
1736 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1737 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1738 // load later in the SILoadStoreOptimizer.
1739 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1740 return false;
1741
1742 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
1743 // can do a 4 byte aligned, 8 byte access in a single operation using
1744 // ds_read2/write2_b32 with adjacent offsets.
1745 RequiredAlignment = Align(4);
1746
1747 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1748 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1749 // ds_write2_b32 depending on the alignment. In either case with either
1750 // alignment there is no faster way of doing this.
1751
1752 // The numbers returned here and below are not additive, it is a 'speed
1753 // rank'. They are just meant to be compared to decide if a certain way
1754 // of lowering an operation is faster than another. For that purpose
1755 // naturally aligned operation gets it bitsize to indicate that "it
1756 // operates with a speed comparable to N-bit wide load". With the full
1757 // alignment ds128 is slower than ds96 for example. If underaligned it
1758 // is comparable to a speed of a single dword access, which would then
1759 // mean 32 < 128 and it is faster to issue a wide load regardless.
1760 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
1761 // wider load which will not be aligned anymore the latter is slower.
1762 if (IsFast)
1763 *IsFast = (Alignment >= RequiredAlignment) ? 64
1764 : (Alignment < Align(4)) ? 32
1765 : 1;
1766 return true;
1767 }
1768
1769 break;
1770 case 96:
1771 if (!Subtarget->hasDS96AndDS128())
1772 return false;
1773
1774 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
1775 // gfx8 and older.
1776
1777 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1778 // Naturally aligned access is fastest. However, also report it is Fast
1779 // if memory is aligned less than DWORD. A narrow load or store will be
1780 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1781 // be more of them, so overall we will pay less penalty issuing a single
1782 // instruction.
1783
1784 // See comment on the values above.
1785 if (IsFast)
1786 *IsFast = (Alignment >= RequiredAlignment) ? 96
1787 : (Alignment < Align(4)) ? 32
1788 : 1;
1789 return true;
1790 }
1791
1792 break;
1793 case 128:
1794 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1795 return false;
1796
1797 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
1798 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1799 // single operation using ds_read2/write2_b64.
1800 RequiredAlignment = Align(8);
1801
1802 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1803 // Naturally aligned access is fastest. However, also report it is Fast
1804 // if memory is aligned less than DWORD. A narrow load or store will be
1805 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1806 // will be more of them, so overall we will pay less penalty issuing a
1807 // single instruction.
1808
1809 // See comment on the values above.
1810 if (IsFast)
1811 *IsFast = (Alignment >= RequiredAlignment) ? 128
1812 : (Alignment < Align(4)) ? 32
1813 : 1;
1814 return true;
1815 }
1816
1817 break;
1818 default:
1819 if (Size > 32)
1820 return false;
1821
1822 break;
1823 }
1824
1825 // See comment on the values above.
1826 // Note that we have a single-dword or sub-dword here, so if underaligned
1827 // it is a slowest possible access, hence returned value is 0.
1828 if (IsFast)
1829 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1830
1831 return Alignment >= RequiredAlignment ||
1832 Subtarget->hasUnalignedDSAccessEnabled();
1833 }
1834
1835 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1836 bool AlignedBy4 = Alignment >= Align(4);
1837 if (IsFast)
1838 *IsFast = AlignedBy4;
1839
1840 return AlignedBy4 ||
1841 Subtarget->enableFlatScratch() ||
1842 Subtarget->hasUnalignedScratchAccess();
1843 }
1844
1845 // FIXME: We have to be conservative here and assume that flat operations
1846 // will access scratch. If we had access to the IR function, then we
1847 // could determine if any private memory was used in the function.
1848 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1849 !Subtarget->hasUnalignedScratchAccess()) {
1850 bool AlignedBy4 = Alignment >= Align(4);
1851 if (IsFast)
1852 *IsFast = AlignedBy4;
1853
1854 return AlignedBy4;
1855 }
1856
1857 // So long as they are correct, wide global memory operations perform better
1858 // than multiple smaller memory ops -- even when misaligned
1859 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1860 if (IsFast)
1861 *IsFast = Size;
1862
1863 return Alignment >= Align(4) ||
1865 }
1866
1867 // Smaller than dword value must be aligned.
1868 if (Size < 32)
1869 return false;
1870
1871 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1872 // byte-address are ignored, thus forcing Dword alignment.
1873 // This applies to private, global, and constant memory.
1874 if (IsFast)
1875 *IsFast = 1;
1876
1877 return Size >= 32 && Alignment >= Align(4);
1878}
1879
// Thin public wrapper forwarding to allowsMisalignedMemoryAccessesImpl with
// the size taken from VT.
// NOTE(review): the signature line (1880) and the start of the forwarding
// call (1883) are elided in this listing.
1881 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1882 unsigned *IsFast) const {
1884 Alignment, Flags, IsFast);
1885}
1886
// Chooses the element type for lowering memcpy/memset-style operations:
// prefer 4-dword (v4i32) then 2-dword (v2i32) chunks when the destination is
// at least 4-byte aligned and the operation is large enough; otherwise let
// the generic code decide (MVT::Other).
// NOTE(review): the signature line (1887) is elided in this listing.
1888 const MemOp &Op, const AttributeList &FuncAttributes) const {
1889 // FIXME: Should account for address space here.
1890
1891 // The default fallback uses the private pointer size as a guess for a type to
1892 // use. Make sure we switch these to 64-bit accesses.
1893
1894 if (Op.size() >= 16 &&
1895 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1896 return MVT::v4i32;
1897
1898 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1899 return MVT::v2i32;
1900
1901 // Use the default.
1902 return MVT::Other;
1903}
1904
// Tests whether the memory node's operand carries the target-specific
// MONoClobber flag.
// NOTE(review): the signature line (1905) is elided in this listing.
1906 const MemSDNode *MemNode = cast<MemSDNode>(N);
1907 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1908}
1914
// An addrspacecast out of the flat address space is free (truncate or no-op);
// everything else defers to the target machine's no-op-cast query.
// NOTE(review): the signature line (1915) is elided in this listing.
1916 unsigned DestAS) const {
1917 // Flat -> private/local is a simple truncate.
1918 // Flat -> global is no-op
1919 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1920 return true;
1921
1922 const GCNTargetMachine &TM =
1923 static_cast<const GCNTargetMachine &>(getTargetMachine());
1924 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1925}
1926
1934
// Unconditionally answers true for converting a constant load into an
// immediate (presumably shouldConvertConstantLoadToIntImm — the signature
// line, 1935, is elided in this listing; verify against upstream).
1936 Type *Ty) const {
1937 // FIXME: Could be smarter if called for vector constants.
1938 return true;
1939}
1940
// Extracting a subvector is considered cheap only at index 0 (a no-op
// subregister extract), subject to a condition on the elided line 1944.
// NOTE(review): the signature line (1941) and the early-return condition
// (1944) are elided in this listing.
1942 unsigned Index) const {
1944 return false;
1945
1946 // TODO: Add more cases that are cheap.
1947 return Index == 0;
1948}
1949
// Controls which (op, type) pairs the DAG combiner should keep: i16 is only
// desirable for loads/stores when 16-bit instructions exist, and i1 setcc is
// never desirable (no i1 setcc instructions).
// NOTE(review): the signature line (1950) and line 1966 (the fallthrough
// return, presumably the base implementation) are elided in this listing.
1951 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1952 switch (Op) {
1953 case ISD::LOAD:
1954 case ISD::STORE:
1955 return true;
1956 default:
1957 return false;
1958 }
1959 }
1960
1961 // SimplifySetCC uses this function to determine whether or not it should
1962 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1963 if (VT == MVT::i1 && Op == ISD::SETCC)
1964 return false;
1965
1967}
1968
// Builds a pointer to a kernel argument at Offset bytes into the kernarg
// segment: copies the preloaded kernarg-segment-pointer SGPR out of the
// live-in virtual register and offsets it. Falls back to a bare constant when
// no kernarg segment pointer was preloaded (kernel has no arguments).
// NOTE(review): lines 1974-1975, 1980, and 1990 (PtrVT/Info/MRI local
// declarations) are elided in this listing.
1969SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1970 const SDLoc &SL,
1971 SDValue Chain,
1972 uint64_t Offset) const {
1973 const DataLayout &DL = DAG.getDataLayout();
1976
1977 const ArgDescriptor *InputPtrReg;
1978 const TargetRegisterClass *RC;
1979 LLT ArgTy;
1981
1982 std::tie(InputPtrReg, RC, ArgTy) =
1983 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1984
1985 // We may not have the kernarg segment argument if we have no kernel
1986 // arguments.
1987 if (!InputPtrReg)
1988 return DAG.getConstant(Offset, SL, PtrVT);
1989
1991 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1992 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1993
1994 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1995}
1996
// Returns a pointer to the implicit-argument area, which lives at an offset
// past the explicit kernel arguments.
// NOTE(review): lines 1999-2000 (computation of 'Offset', presumably from the
// function info's explicit kernarg size) are elided in this listing.
1997SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1998 const SDLoc &SL) const {
2001 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2002}
2003
// Materializes the LDS kernel id as an i32 constant when its value is known
// at compile time; returns an empty SDValue otherwise.
// NOTE(review): lines 2007 and 2009 (the Function reference and the call
// producing KnownSize) are elided in this listing.
2004SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2005 const SDLoc &SL) const {
2006
2008 std::optional<uint32_t> KnownSize =
2010 if (KnownSize.has_value())
2011 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2012 return SDValue();
2013}
2014
// Converts a loaded argument value of memory type MemVT to the IR-visible
// type VT: narrows widened vectors, attaches AssertZext/AssertSext for
// ext-attributed args, then extends/truncates/rounds to VT.
// NOTE(review): lines 2021 and 2023-2024 (the widened-vector condition and
// the NarrowedVT construction) are elided in this listing.
2015SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2016 const SDLoc &SL, SDValue Val,
2017 bool Signed,
2018 const ISD::InputArg *Arg) const {
2019 // First, if it is a widened vector, narrow it.
2020 if (VT.isVector() &&
2022 EVT NarrowedVT =
2025 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2026 DAG.getConstant(0, SL, MVT::i32));
2027 }
2028
2029 // Then convert the vector elements or scalar value.
2030 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2031 VT.bitsLT(MemVT)) {
2032 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2033 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2034 }
2035
2036 if (MemVT.isFloatingPoint())
2037 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2038 else if (Signed)
2039 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2040 else
2041 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2042
2043 return Val;
2044}
2045
// Loads a kernel argument from the kernarg segment. Small, under-aligned
// arguments are loaded as a dword from the aligned-down address and the
// relevant bits are shifted out, so the load can merge with a neighbor's;
// everything else is a direct load at the argument's offset. Returns the
// converted value merged with the load's chain.
// NOTE(review): lines 2050 (PtrInfo declaration) and 2066-2067 / 2082-2083
// (trailing MachineMemOperand flag arguments to DAG.getLoad) are elided in
// this listing.
2046SDValue SITargetLowering::lowerKernargMemParameter(
2047 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2048 uint64_t Offset, Align Alignment, bool Signed,
2049 const ISD::InputArg *Arg) const {
2051
2052 // Try to avoid using an extload by loading earlier than the argument address,
2053 // and extracting the relevant bits. The load should hopefully be merged with
2054 // the previous argument.
2055 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2056 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2057 int64_t AlignDownOffset = alignDown(Offset, 4);
2058 int64_t OffsetDiff = Offset - AlignDownOffset;
2059
2060 EVT IntVT = MemVT.changeTypeToInteger();
2061
2062 // TODO: If we passed in the base kernel offset we could have a better
2063 // alignment than 4, but we don't really need it.
2064 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2065 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2068
2069 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2070 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2071
2072 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2073 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2074 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2075
2076
2077 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2078 }
2079
2080 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2081 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2084
2085 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2086 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2087}
2088
// Lower a formal argument that was assigned a stack location: byval arguments
// become a frame index, everything else becomes a fixed-object load with the
// extension kind implied by the CCValAssign location info.
// NOTE(review): the declarations of MF and the default ExtType
// (presumably ISD::NON_EXTLOAD) are elided in this excerpt.
2089SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2090 const SDLoc &SL, SDValue Chain,
2091 const ISD::InputArg &Arg) const {
2093 MachineFrameInfo &MFI = MF.getFrameInfo();
2094
2095 if (Arg.Flags.isByVal()) {
 // Byval: the caller already wrote the bytes; just hand back the address.
2096 unsigned Size = Arg.Flags.getByValSize();
2097 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2098 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2099 }
2100
2101 unsigned ArgOffset = VA.getLocMemOffset();
2102 unsigned ArgSize = VA.getValVT().getStoreSize();
2103
2104 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2105
2106 // Create load nodes to retrieve arguments from the stack.
2107 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2108 SDValue ArgValue;
2109
2110 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2112 MVT MemVT = VA.getValVT();
2113
 // Map the calling-convention location info to a memory type / load
 // extension: BCvt keeps the location type, SExt/ZExt/AExt pick the
 // matching extending-load kind.
2114 switch (VA.getLocInfo()) {
2115 default:
2116 break;
2117 case CCValAssign::BCvt:
2118 MemVT = VA.getLocVT();
2119 break;
2120 case CCValAssign::SExt:
2121 ExtType = ISD::SEXTLOAD;
2122 break;
2123 case CCValAssign::ZExt:
2124 ExtType = ISD::ZEXTLOAD;
2125 break;
2126 case CCValAssign::AExt:
2127 ExtType = ISD::EXTLOAD;
2128 break;
2129 }
2130
2131 ArgValue = DAG.getExtLoad(
2132 ExtType, SL, VA.getLocVT(), Chain, FIN,
2134 MemVT);
2135 return ArgValue;
2136}
2137
// Return the SDValue for a special preloaded input (workgroup IDs, dispatch
// ptr, etc.). With architected SGPRs the workgroup IDs live in fixed TTMP
// registers; otherwise the location comes from MFI.getPreloadedValue. If no
// register was allocated, returns 0 (missing kernarg segment) or UNDEF
// (amdgpu-no-* attribute UB case).
// NOTE(review): the PVID parameter declaration and the switch case labels are
// elided in this excerpt; the cases presumably correspond to
// WORKGROUP_ID_X/Y/Z — confirm against the full source.
2138SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2139 const SIMachineFunctionInfo &MFI,
2140 EVT VT,
2142 const ArgDescriptor *Reg = nullptr;
2143 const TargetRegisterClass *RC;
2144 LLT Ty;
2145
2146 CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
 // Fixed hardware locations for architected-SGPR workgroup IDs: X in TTMP9,
 // Y in the low half of TTMP7, Z in the high half of TTMP7.
2147 const ArgDescriptor WorkGroupIDX =
2148 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2149 // If GridZ is not programmed in an entry function then the hardware will set
2150 // it to all zeros, so there is no need to mask the GridY value in the low
2151 // order bits.
2152 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2153 AMDGPU::TTMP7,
2154 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2155 const ArgDescriptor WorkGroupIDZ =
2156 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2157 if (Subtarget->hasArchitectedSGPRs() &&
2159 switch (PVID) {
2161 Reg = &WorkGroupIDX;
2162 RC = &AMDGPU::SReg_32RegClass;
2163 Ty = LLT::scalar(32);
2164 break;
2166 Reg = &WorkGroupIDY;
2167 RC = &AMDGPU::SReg_32RegClass;
2168 Ty = LLT::scalar(32);
2169 break;
2171 Reg = &WorkGroupIDZ;
2172 RC = &AMDGPU::SReg_32RegClass;
2173 Ty = LLT::scalar(32);
2174 break;
2175 default:
2176 break;
2177 }
2178 }
2179
 // Fall back to the register/stack location recorded by argument lowering.
2180 if (!Reg)
2181 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2182 if (!Reg) {
2184 // It's possible for a kernarg intrinsic call to appear in a kernel with
2185 // no allocated segment, in which case we do not add the user sgpr
2186 // argument, so just return null.
2187 return DAG.getConstant(0, SDLoc(), VT);
2188 }
2189
2190 // It's undefined behavior if a function marked with the amdgpu-no-*
2191 // attributes uses the corresponding intrinsic.
2192 return DAG.getUNDEF(VT);
2193 }
2194
2195 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2196}
2197
// Walk the incoming PS (pixel shader) arguments: record which PS input slots
// are allocated/enabled, collect the kept arguments into Splits, and mark
// unused, skippable inputs in the Skipped bitvector.
// NOTE(review): the first line of the signature (function name and the Splits
// parameter) is elided in this excerpt.
2199 CallingConv::ID CallConv,
2200 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2201 FunctionType *FType,
2202 SIMachineFunctionInfo *Info) {
2203 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2204 const ISD::InputArg *Arg = &Ins[I];
2205
2206 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2207 "vector type argument should have been split");
2208
2209 // First check if it's a PS input addr.
2210 if (CallConv == CallingConv::AMDGPU_PS &&
2211 !Arg->Flags.isInReg() && PSInputNum <= 15) {
 // An input may be skipped only if it is both unused and not already
 // allocated (e.g. by an earlier part of a split argument).
2212 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2213
2214 // Inconveniently only the first part of the split is marked as isSplit,
2215 // so skip to the end. We only want to increment PSInputNum once for the
2216 // entire split argument.
2217 if (Arg->Flags.isSplit()) {
2218 while (!Arg->Flags.isSplitEnd()) {
2219 assert((!Arg->VT.isVector() ||
2220 Arg->VT.getScalarSizeInBits() == 16) &&
2221 "unexpected vector split in ps argument type");
2222 if (!SkipArg)
2223 Splits.push_back(*Arg);
2224 Arg = &Ins[++I];
2225 }
2226 }
2227
2228 if (SkipArg) {
2229 // We can safely skip PS inputs.
2230 Skipped.set(Arg->getOrigArgIndex());
2231 ++PSInputNum;
2232 continue;
2233 }
2234
2235 Info->markPSInputAllocated(PSInputNum);
2236 if (Arg->Used)
2237 Info->markPSInputEnabled(PSInputNum);
2238
2239 ++PSInputNum;
2240 }
2241
2242 Splits.push_back(*Arg);
2243 }
2244}
2245
2246// Allocate special inputs passed in VGPRs.
// For entry functions, reserve VGPR0-2 for the workitem IDs. With packed TID
// support all three IDs share VGPR0 in 10-bit fields; otherwise X/Y/Z occupy
// VGPR0/1/2 respectively.
// NOTE(review): the signature line and the MRI declaration are elided in this
// excerpt.
2248 MachineFunction &MF,
2249 const SIRegisterInfo &TRI,
2250 SIMachineFunctionInfo &Info) const {
2251 const LLT S32 = LLT::scalar(32);
2253
2254 if (Info.hasWorkItemIDX()) {
2255 Register Reg = AMDGPU::VGPR0;
2256 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2257
2258 CCInfo.AllocateReg(Reg);
 // With packed TID and a live Y component, X only occupies the low 10
 // bits of VGPR0; otherwise it owns the whole register.
2259 unsigned Mask = (Subtarget->hasPackedTID() &&
2260 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2261 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2262 }
2263
2264 if (Info.hasWorkItemIDY()) {
2265 assert(Info.hasWorkItemIDX());
2266 if (Subtarget->hasPackedTID()) {
2267 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2268 0x3ff << 10));
2269 } else {
2270 unsigned Reg = AMDGPU::VGPR1;
2271 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2272
2273 CCInfo.AllocateReg(Reg);
2274 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2275 }
2276 }
2277
2278 if (Info.hasWorkItemIDZ()) {
2279 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2280 if (Subtarget->hasPackedTID()) {
2281 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2282 0x3ff << 20));
2283 } else {
2284 unsigned Reg = AMDGPU::VGPR2;
2285 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2286
2287 CCInfo.AllocateReg(Reg);
2288 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2289 }
2290 }
2291}
2292
2293// Try to allocate a VGPR at the end of the argument list, or if no argument
2294// VGPRs are left allocating a stack slot.
2295// If \p Mask is is given it indicates bitfield position in the register.
2296// If \p Arg is given use it with new ]p Mask instead of allocating new.
2297static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2298 ArgDescriptor Arg = ArgDescriptor()) {
2299 if (Arg.isSet())
2300 return ArgDescriptor::createArg(Arg, Mask);
2301
2302 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2303 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2304 if (RegIdx == ArgVGPRs.size()) {
2305 // Spill to stack required.
2306 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2307
2308 return ArgDescriptor::createStack(Offset, Mask);
2309 }
2310
2311 unsigned Reg = ArgVGPRs[RegIdx];
2312 Reg = CCInfo.AllocateReg(Reg);
2313 assert(Reg != AMDGPU::NoRegister);
2314
2315 MachineFunction &MF = CCInfo.getMachineFunction();
2316 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2317 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2318 return ArgDescriptor::createRegister(Reg, Mask);
2319}
2320
// Allocate the next free 32-bit SGPR from \p RC for an implicit input,
// aborting compilation if none remain, and record it as a live-in.
// NOTE(review): the signature first line and the final return statement
// (presumably returning an ArgDescriptor for Reg) are elided in this excerpt.
2322 const TargetRegisterClass *RC,
2323 unsigned NumArgRegs) {
2324 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2325 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2326 if (RegIdx == ArgSGPRs.size())
2327 report_fatal_error("ran out of SGPRs for arguments");
2328
2329 unsigned Reg = ArgSGPRs[RegIdx];
2330 Reg = CCInfo.AllocateReg(Reg);
2331 assert(Reg != AMDGPU::NoRegister);
2332
2333 MachineFunction &MF = CCInfo.getMachineFunction();
2334 MF.addLiveIn(Reg, RC);
2336}
2337
2338// If this has a fixed position, we still should allocate the register in the
2339// CCInfo state. Technically we could get away with this for values passed
2340// outside of the normal argument range.
// NOTE(review): the signature first line is elided in this excerpt.
2342 const TargetRegisterClass *RC,
2343 MCRegister Reg) {
2344 Reg = CCInfo.AllocateReg(Reg);
2345 assert(Reg != AMDGPU::NoRegister);
2346 MachineFunction &MF = CCInfo.getMachineFunction();
2347 MF.addLiveIn(Reg, RC);
2348}
2349
2350static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2351 if (Arg) {
2352 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2353 Arg.getRegister());
2354 } else
2355 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2356}
2357
2358static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2359 if (Arg) {
2360 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2361 Arg.getRegister());
2362 } else
2363 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2364}
2365
2366/// Allocate implicit function VGPR arguments at the end of allocated user
2367/// arguments.
// NOTE(review): the signature first line is elided in this excerpt.
2369 CCState &CCInfo, MachineFunction &MF,
2370 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
 // Each workitem ID is a 10-bit field; Y and Z are packed into the same
 // VGPR as X (at bit 10 and 20) when a VGPR was already assigned.
2371 const unsigned Mask = 0x3ff;
2372 ArgDescriptor Arg;
2373
2374 if (Info.hasWorkItemIDX()) {
2375 Arg = allocateVGPR32Input(CCInfo, Mask);
2376 Info.setWorkItemIDX(Arg);
2377 }
2378
2379 if (Info.hasWorkItemIDY()) {
2380 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2381 Info.setWorkItemIDY(Arg);
2382 }
2383
2384 if (Info.hasWorkItemIDZ())
2385 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2386}
2387
2388/// Allocate implicit function VGPR arguments in fixed registers.
// Fixed-ABI variant: all three workitem IDs are packed into VGPR31 as 10-bit
// fields (X at bit 0, Y at bit 10, Z at bit 20).
// NOTE(review): the signature first line is elided in this excerpt.
2390 CCState &CCInfo, MachineFunction &MF,
2391 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2392 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2393 if (!Reg)
2394 report_fatal_error("failed to allocated VGPR for implicit arguments");
2395
2396 const unsigned Mask = 0x3ff;
2397 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2398 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2399 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2400}
2401
// Allocate the implicit SGPR inputs a non-kernel function may need (dispatch
// ptr, queue ptr, implicit arg ptr, dispatch id, workgroup IDs, LDS kernel
// id), preferring any fixed registers already recorded in ArgInfo.
// NOTE(review): the signature first line and the second half of the queue-ptr
// condition are elided in this excerpt.
2403 CCState &CCInfo,
2404 MachineFunction &MF,
2405 const SIRegisterInfo &TRI,
2406 SIMachineFunctionInfo &Info) const {
2407 auto &ArgInfo = Info.getArgInfo();
2408 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2409
2410 // TODO: Unify handling with private memory pointers.
2411 if (UserSGPRInfo.hasDispatchPtr())
2412 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2413
2414 const Module *M = MF.getFunction().getParent();
2415 if (UserSGPRInfo.hasQueuePtr() &&
2417 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2418
2419 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2420 // constant offset from the kernarg segment.
2421 if (Info.hasImplicitArgPtr())
2422 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2423
2424 if (UserSGPRInfo.hasDispatchID())
2425 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2426
2427 // flat_scratch_init is not applicable for non-kernel functions.
2428
2429 if (Info.hasWorkGroupIDX())
2430 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2431
2432 if (Info.hasWorkGroupIDY())
2433 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2434
2435 if (Info.hasWorkGroupIDZ())
2436 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2437
2438 if (Info.hasLDSKernelId())
2439 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2440}
2441
2442// Allocate special inputs passed in user SGPRs.
// The allocation order here determines the user SGPR layout for HSA kernels
// (implicit buffer ptr, private segment buffer, dispatch ptr, queue ptr,
// kernarg segment ptr, dispatch id, flat scratch init, private segment size).
// NOTE(review): the signature first line, part of the queue-ptr condition and
// the MRI declaration are elided in this excerpt.
2444 MachineFunction &MF,
2445 const SIRegisterInfo &TRI,
2446 SIMachineFunctionInfo &Info) const {
2447 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2448 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2449 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2450 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2451 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2452 }
2453
2454 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2455 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2456 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2457 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2458 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2459 }
2460
2461 if (UserSGPRInfo.hasDispatchPtr()) {
2462 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2463 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2464 CCInfo.AllocateReg(DispatchPtrReg);
2465 }
2466
2467 const Module *M = MF.getFunction().getParent();
2468 if (UserSGPRInfo.hasQueuePtr() &&
2470 Register QueuePtrReg = Info.addQueuePtr(TRI);
2471 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2472 CCInfo.AllocateReg(QueuePtrReg);
2473 }
2474
2475 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2477 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2478 CCInfo.AllocateReg(InputPtrReg);
2479
 // Give the live-in a 64-bit constant-address-space pointer type for
 // GlobalISel.
2480 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2481 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2482 }
2483
2484 if (UserSGPRInfo.hasDispatchID()) {
2485 Register DispatchIDReg = Info.addDispatchID(TRI);
2486 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2487 CCInfo.AllocateReg(DispatchIDReg);
2488 }
2489
2490 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2491 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2492 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2493 CCInfo.AllocateReg(FlatScratchInitReg);
2494 }
2495
2496 if (UserSGPRInfo.hasPrivateSegmentSize()) {
2497 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
2498 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
2499 CCInfo.AllocateReg(PrivateSegmentSizeReg);
2500 }
2501
2502 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2503 // these from the dispatch pointer.
2504}
2505
2506// Allocate pre-loaded kernel arguments. Arguments to be preloading must be
2507// sequential starting from the first argument.
// Walks the inreg-attributed leading arguments, assigning user SGPRs to each
// until either the attribute chain breaks or the free user SGPRs run out.
// NOTE(review): the signature first lines (function name and Ins parameter)
// are elided in this excerpt.
2509 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2511 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2512 Function &F = MF.getFunction();
2513 unsigned LastExplicitArgOffset =
2514 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2515 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2516 bool InPreloadSequence = true;
2517 unsigned InIdx = 0;
2518 for (auto &Arg : F.args()) {
2519 if (!InPreloadSequence || !Arg.hasInRegAttr())
2520 break;
2521
2522 int ArgIdx = Arg.getArgNo();
2523 // Don't preload non-original args or parts not in the current preload
2524 // sequence.
2525 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2526 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2527 break;
2528
 // Handle every Ins entry belonging to this IR argument (split args
 // produce several entries with the same original index).
2529 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2530 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2531 InIdx++) {
2532 assert(ArgLocs[ArgIdx].isMemLoc());
2533 auto &ArgLoc = ArgLocs[InIdx];
2534 const Align KernelArgBaseAlign = Align(16);
2535 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2536 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2537 unsigned NumAllocSGPRs =
2538 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2539
2540 // Arg is preloaded into the previous SGPR.
2541 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2542 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2543 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2544 continue;
2545 }
2546
 // Padding between the previous argument's end and this one also
 // consumes SGPRs (one per dword).
2547 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2548 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2549 // Check for free user SGPRs for preloading.
2550 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2551 SGPRInfo.getNumFreeUserSGPRs()) {
2552 InPreloadSequence = false;
2553 break;
2554 }
2555
2556 // Preload this argument.
2557 const TargetRegisterClass *RC =
2558 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2559 SmallVectorImpl<MCRegister> *PreloadRegs =
2560 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2561
 // Multi-register preloads are recorded as individual 32-bit live-ins.
2562 if (PreloadRegs->size() > 1)
2563 RC = &AMDGPU::SGPR_32RegClass;
2564 for (auto &Reg : *PreloadRegs) {
2565 assert(Reg);
2566 MF.addLiveIn(Reg, RC);
2567 CCInfo.AllocateReg(Reg);
2568 }
2569
2570 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2571 }
2572 }
2573}
2574
// Allocate the synthetic LDS kernel id SGPR, if required.
// NOTE(review): the signature first line is elided in this excerpt.
2576 const SIRegisterInfo &TRI,
2577 SIMachineFunctionInfo &Info) const {
2578 // Always allocate this last since it is a synthetic preload.
2579 if (Info.hasLDSKernelId()) {
2580 Register Reg = Info.addLDSKernelId();
2581 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2582 CCInfo.AllocateReg(Reg);
2583 }
2584}
2585
2586// Allocate special input registers that are initialized per-wave.
// Allocates the system SGPRs: workgroup IDs X/Y/Z, workgroup info, and the
// private segment wave byte offset. Also pads user SGPRs up to 16 on
// subtargets with the user-SGPR-init bug.
// NOTE(review): the signature first line and the Info parameter line are
// elided in this excerpt.
2588 MachineFunction &MF,
2590 CallingConv::ID CallConv,
2591 bool IsShader) const {
2592 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2593 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2594 // Note: user SGPRs are handled by the front-end for graphics shaders
2595 // Pad up the used user SGPRs with dead inputs.
2596
2597 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2598 // before enabling architected SGPRs for workgroup IDs.
2599 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2600
2601 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2602 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2603 // rely on it to reach 16 since if we end up having no stack usage, it will
2604 // not really be added.
2605 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2606 Info.hasWorkGroupIDY() +
2607 Info.hasWorkGroupIDZ() +
2608 Info.hasWorkGroupInfo();
2609 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2610 Register Reg = Info.addReservedUserSGPR();
2611 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2612 CCInfo.AllocateReg(Reg);
2613 }
2614 }
2615
 // With architected SGPRs the workgroup IDs live in TTMP registers and need
 // no explicit allocation here.
2616 if (!HasArchitectedSGPRs) {
2617 if (Info.hasWorkGroupIDX()) {
2618 Register Reg = Info.addWorkGroupIDX();
2619 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2620 CCInfo.AllocateReg(Reg);
2621 }
2622
2623 if (Info.hasWorkGroupIDY()) {
2624 Register Reg = Info.addWorkGroupIDY();
2625 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2626 CCInfo.AllocateReg(Reg);
2627 }
2628
2629 if (Info.hasWorkGroupIDZ()) {
2630 Register Reg = Info.addWorkGroupIDZ();
2631 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2632 CCInfo.AllocateReg(Reg);
2633 }
2634 }
2635
2636 if (Info.hasWorkGroupInfo()) {
2637 Register Reg = Info.addWorkGroupInfo();
2638 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2639 CCInfo.AllocateReg(Reg);
2640 }
2641
2642 if (Info.hasPrivateSegmentWaveByteOffset()) {
2643 // Scratch wave offset passed in system SGPR.
2644 unsigned PrivateSegmentWaveByteOffsetReg;
2645
2646 if (IsShader) {
2647 PrivateSegmentWaveByteOffsetReg =
2648 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2649
2650 // This is true if the scratch wave byte offset doesn't have a fixed
2651 // location.
2652 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2653 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2654 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2655 }
2656 } else
2657 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2658
2659 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2660 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2661 }
2662
2663 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2664 Info.getNumPreloadedSGPRs() >= 16);
2665}
2666
// Decide which physical registers will serve as the scratch RSrc, stack
// pointer, and frame pointer for this function, based on whether any stack
// access is required.
// NOTE(review): the signature first line and a few interior lines (scratch
// RSrc reg selection under HSA, the MRI declaration, and the non-SGPR32 SP
// comment block) are elided in this excerpt.
2668 MachineFunction &MF,
2669 const SIRegisterInfo &TRI,
2670 SIMachineFunctionInfo &Info) {
2671 // Now that we've figured out where the scratch register inputs are, see if
2672 // should reserve the arguments and use them directly.
2673 MachineFrameInfo &MFI = MF.getFrameInfo();
2674 bool HasStackObjects = MFI.hasStackObjects();
2675 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2676
2677 // Record that we know we have non-spill stack objects so we don't need to
2678 // check all stack objects later.
2679 if (HasStackObjects)
2680 Info.setHasNonSpillStackObjects(true);
2681
2682 // Everything live out of a block is spilled with fast regalloc, so it's
2683 // almost certain that spilling will be required.
2684 if (TM.getOptLevel() == CodeGenOptLevel::None)
2685 HasStackObjects = true;
2686
2687 // For now assume stack access is needed in any callee functions, so we need
2688 // the scratch registers to pass in.
2689 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2690
2691 if (!ST.enableFlatScratch()) {
2692 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2693 // If we have stack objects, we unquestionably need the private buffer
2694 // resource. For the Code Object V2 ABI, this will be the first 4 user
2695 // SGPR inputs. We can reserve those and use them directly.
2696
2697 Register PrivateSegmentBufferReg =
2699 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2700 } else {
2701 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2702 // We tentatively reserve the last registers (skipping the last registers
2703 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
2704 // we'll replace these with the ones immediately after those which were
2705 // really allocated. In the prologue copies will be inserted from the
2706 // argument to these reserved registers.
2707
2708 // Without HSA, relocations are used for the scratch pointer and the
2709 // buffer resource setup is always inserted in the prologue. Scratch wave
2710 // offset is still in an input SGPR.
2711 Info.setScratchRSrcReg(ReservedBufferReg);
2712 }
2713 }
2714
2716
2717 // For entry functions we have to set up the stack pointer if we use it,
2718 // whereas non-entry functions get this "for free". This means there is no
2719 // intrinsic advantage to using S32 over S34 in cases where we do not have
2720 // calls but do need a frame pointer (i.e. if we are requested to have one
2721 // because frame pointer elimination is disabled). To keep things simple we
2722 // only ever use S32 as the call ABI stack pointer, and so using it does not
2723 // imply we need a separate frame pointer.
2724 //
2725 // Try to use s32 as the SP, but move it if it would interfere with input
2726 // arguments. This won't work with calls though.
2727 //
2728 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2729 // registers.
2730 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2731 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2732 } else {
2734
2735 if (MFI.hasCalls())
2736 report_fatal_error("call in graphics shader with too many input SGPRs");
2737
 // SGPR32 is taken by an input: fall back to the first SGPR that is not a
 // live-in.
2738 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2739 if (!MRI.isLiveIn(Reg)) {
2740 Info.setStackPtrOffsetReg(Reg);
2741 break;
2742 }
2743 }
2744
2745 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2746 report_fatal_error("failed to find register for SP");
2747 }
2748
2749 // hasFP should be accurate for entry functions even before the frame is
2750 // finalized, because it does not rely on the known stack size, only
2751 // properties like whether variable sized objects are present.
2752 if (ST.getFrameLowering()->hasFP(MF)) {
2753 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2754 }
2755}
2756
 // Split-CSR is supported only for non-entry functions: entry functions have
 // no callers to preserve registers for.
 // NOTE(review): the function signature and the Info lookup are elided in
 // this excerpt.
2759 return !Info->isEntryFunction();
2760}
2761
2765
// Implement split-CSR: in the entry block copy each callee-saved register
// into a fresh virtual register, and restore it before the terminator of
// every exit block, so the register allocator can shrink-wrap the saves.
// NOTE(review): the signature first line, the TRI lookup and the MBBI
// declaration are elided in this excerpt.
2767 MachineBasicBlock *Entry,
2768 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2770
2771 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2772 if (!IStart)
2773 return;
2774
2775 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2776 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2778 for (const MCPhysReg *I = IStart; *I; ++I) {
 // Only 32- and 64-bit SGPR CSRs are expected here.
2779 const TargetRegisterClass *RC = nullptr;
2780 if (AMDGPU::SReg_64RegClass.contains(*I))
2781 RC = &AMDGPU::SGPR_64RegClass;
2782 else if (AMDGPU::SReg_32RegClass.contains(*I))
2783 RC = &AMDGPU::SGPR_32RegClass;
2784 else
2785 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2786
2787 Register NewVR = MRI->createVirtualRegister(RC);
2788 // Create copy from CSR to a virtual register.
2789 Entry->addLiveIn(*I);
2790 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2791 .addReg(*I);
2792
2793 // Insert the copy-back instructions right before the terminator.
2794 for (auto *Exit : Exits)
2795 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2796 TII->get(TargetOpcode::COPY), *I)
2797 .addReg(NewVR);
2798 }
2799}
2800
2802 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2803 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2804 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2806
2808 const Function &Fn = MF.getFunction();
2811
2812 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2813 DiagnosticInfoUnsupported NoGraphicsHSA(
2814 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2815 DAG.getContext()->diagnose(NoGraphicsHSA);
2816 return DAG.getEntryNode();
2817 }
2818
2821 BitVector Skipped(Ins.size());
2822 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2823 *DAG.getContext());
2824
2825 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2826 bool IsKernel = AMDGPU::isKernel(CallConv);
2827 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2828
2829 if (IsGraphics) {
2830 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2831 assert(!UserSGPRInfo.hasDispatchPtr() &&
2832 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2833 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2834 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2835 (void)UserSGPRInfo;
2836 if (!Subtarget->enableFlatScratch())
2837 assert(!UserSGPRInfo.hasFlatScratchInit());
2838 if ((CallConv != CallingConv::AMDGPU_CS &&
2839 CallConv != CallingConv::AMDGPU_Gfx) ||
2840 !Subtarget->hasArchitectedSGPRs())
2841 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2842 !Info->hasWorkGroupIDZ());
2843 }
2844
2845 if (CallConv == CallingConv::AMDGPU_PS) {
2846 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2847
2848 // At least one interpolation mode must be enabled or else the GPU will
2849 // hang.
2850 //
2851 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2852 // set PSInputAddr, the user wants to enable some bits after the compilation
2853 // based on run-time states. Since we can't know what the final PSInputEna
2854 // will look like, so we shouldn't do anything here and the user should take
2855 // responsibility for the correct programming.
2856 //
2857 // Otherwise, the following restrictions apply:
2858 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2859 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2860 // enabled too.
2861 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2862 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2863 CCInfo.AllocateReg(AMDGPU::VGPR0);
2864 CCInfo.AllocateReg(AMDGPU::VGPR1);
2865 Info->markPSInputAllocated(0);
2866 Info->markPSInputEnabled(0);
2867 }
2868 if (Subtarget->isAmdPalOS()) {
2869 // For isAmdPalOS, the user does not enable some bits after compilation
2870 // based on run-time states; the register values being generated here are
2871 // the final ones set in hardware. Therefore we need to apply the
2872 // workaround to PSInputAddr and PSInputEnable together. (The case where
2873 // a bit is set in PSInputAddr but not PSInputEnable is where the
2874 // frontend set up an input arg for a particular interpolation mode, but
2875 // nothing uses that input arg. Really we should have an earlier pass
2876 // that removes such an arg.)
2877 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2878 if ((PsInputBits & 0x7F) == 0 ||
2879 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2880 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2881 }
2882 } else if (IsKernel) {
2883 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2884 } else {
2885 Splits.append(Ins.begin(), Ins.end());
2886 }
2887
2888 if (IsKernel)
2889 analyzeFormalArgumentsCompute(CCInfo, Ins);
2890
2891 if (IsEntryFunc) {
2892 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2893 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2894 if (IsKernel && Subtarget->hasKernargPreload())
2895 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2896
2897 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2898 } else if (!IsGraphics) {
2899 // For the fixed ABI, pass workitem IDs in the last argument register.
2900 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2901
2902 // FIXME: Sink this into allocateSpecialInputSGPRs
2903 if (!Subtarget->enableFlatScratch())
2904 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2905
2906 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2907 }
2908
2909 if (!IsKernel) {
2910 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2911 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2912 }
2913
2915
2916 // FIXME: This is the minimum kernel argument alignment. We should improve
2917 // this to the maximum alignment of the arguments.
2918 //
2919 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2920 // kern arg offset.
2921 const Align KernelArgBaseAlign = Align(16);
2922
2923 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2924 const ISD::InputArg &Arg = Ins[i];
2925 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2926 InVals.push_back(DAG.getUNDEF(Arg.VT));
2927 continue;
2928 }
2929
2930 CCValAssign &VA = ArgLocs[ArgIdx++];
2931 MVT VT = VA.getLocVT();
2932
2933 if (IsEntryFunc && VA.isMemLoc()) {
2934 VT = Ins[i].VT;
2935 EVT MemVT = VA.getLocVT();
2936
2937 const uint64_t Offset = VA.getLocMemOffset();
2938 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2939
2940 if (Arg.Flags.isByRef()) {
2941 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2942
2943 const GCNTargetMachine &TM =
2944 static_cast<const GCNTargetMachine &>(getTargetMachine());
2945 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2946 Arg.Flags.getPointerAddrSpace())) {
2949 }
2950
2951 InVals.push_back(Ptr);
2952 continue;
2953 }
2954
2955 SDValue NewArg;
2956 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2957 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2958 // In this case the argument is packed into the previous preload SGPR.
2959 int64_t AlignDownOffset = alignDown(Offset, 4);
2960 int64_t OffsetDiff = Offset - AlignDownOffset;
2961 EVT IntVT = MemVT.changeTypeToInteger();
2962
2963 const SIMachineFunctionInfo *Info =
2966 Register Reg =
2967 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2968
2969 assert(Reg);
2970 Register VReg = MRI.getLiveInVirtReg(Reg);
2971 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2972
2973 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2974 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2975
2976 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2977 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2978 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2979 Ins[i].Flags.isSExt(), &Ins[i]);
2980
2981 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2982 } else {
2983 const SIMachineFunctionInfo *Info =
2986 const SmallVectorImpl<MCRegister> &PreloadRegs =
2987 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2988
2989 SDValue Copy;
2990 if (PreloadRegs.size() == 1) {
2991 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2992 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2993 NewArg = DAG.getCopyFromReg(
2994 Chain, DL, VReg,
2996 TRI->getRegSizeInBits(*RC)));
2997
2998 } else {
2999 // If the kernarg alignment does not match the alignment of the SGPR
3000 // tuple RC that can accommodate this argument, it will be built up
3001 // via copies from from the individual SGPRs that the argument was
3002 // preloaded to.
3004 for (auto Reg : PreloadRegs) {
3005 Register VReg = MRI.getLiveInVirtReg(Reg);
3006 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3007 Elts.push_back(Copy);
3008 }
3009 NewArg =
3010 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3011 PreloadRegs.size()),
3012 DL, Elts);
3013 }
3014
3015 // If the argument was preloaded to multiple consecutive 32-bit
3016 // registers because of misalignment between addressable SGPR tuples
3017 // and the argument size, we can still assume that because of kernarg
3018 // segment alignment restrictions that NewArg's size is the same as
3019 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3020 // truncate since we cannot preload to less than a single SGPR and the
3021 // MemVT may be smaller.
3022 EVT MemVTInt =
3024 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3025 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3026
3027 NewArg = DAG.getBitcast(MemVT, NewArg);
3028 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3029 Ins[i].Flags.isSExt(), &Ins[i]);
3030 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3031 }
3032 } else {
3033 NewArg =
3034 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3035 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3036 }
3037 Chains.push_back(NewArg.getValue(1));
3038
3039 auto *ParamTy =
3040 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3042 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3043 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3044 // On SI local pointers are just offsets into LDS, so they are always
3045 // less than 16-bits. On CI and newer they could potentially be
3046 // real pointers, so we can't guarantee their size.
3047 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3048 DAG.getValueType(MVT::i16));
3049 }
3050
3051 InVals.push_back(NewArg);
3052 continue;
3053 }
3054 if (!IsEntryFunc && VA.isMemLoc()) {
3055 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3056 InVals.push_back(Val);
3057 if (!Arg.Flags.isByVal())
3058 Chains.push_back(Val.getValue(1));
3059 continue;
3060 }
3061
3062 assert(VA.isRegLoc() && "Parameter must be in a register!");
3063
3064 Register Reg = VA.getLocReg();
3065 const TargetRegisterClass *RC = nullptr;
3066 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3067 RC = &AMDGPU::VGPR_32RegClass;
3068 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3069 RC = &AMDGPU::SGPR_32RegClass;
3070 else
3071 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3072 EVT ValVT = VA.getValVT();
3073
3074 Reg = MF.addLiveIn(Reg, RC);
3075 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3076
3077 if (Arg.Flags.isSRet()) {
3078 // The return object should be reasonably addressable.
3079
3080 // FIXME: This helps when the return is a real sret. If it is a
3081 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3082 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3083 unsigned NumBits
3085 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3086 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3087 }
3088
3089 // If this is an 8 or 16-bit value, it is really passed promoted
3090 // to 32 bits. Insert an assert[sz]ext to capture this, then
3091 // truncate to the right size.
3092 switch (VA.getLocInfo()) {
3093 case CCValAssign::Full:
3094 break;
3095 case CCValAssign::BCvt:
3096 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3097 break;
3098 case CCValAssign::SExt:
3099 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3100 DAG.getValueType(ValVT));
3101 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3102 break;
3103 case CCValAssign::ZExt:
3104 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3105 DAG.getValueType(ValVT));
3106 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3107 break;
3108 case CCValAssign::AExt:
3109 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3110 break;
3111 default:
3112 llvm_unreachable("Unknown loc info!");
3113 }
3114
3115 InVals.push_back(Val);
3116 }
3117
3118 // Start adding system SGPRs.
3119 if (IsEntryFunc)
3120 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3121
3122 // DAG.getPass() returns nullptr when using new pass manager.
3123 // TODO: Use DAG.getMFAM() to access analysis result.
3124 if (DAG.getPass()) {
3125 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
3126 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3127 }
3128
3129 unsigned StackArgSize = CCInfo.getStackSize();
3130 Info->setBytesInStackArgArea(StackArgSize);
3131
3132 return Chains.empty() ? Chain :
3133 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3134}
3135
3136// TODO: If return values can't fit in registers, we should return as many as
3137// possible in registers before passing on stack.
// Decide whether this function's return values can all be lowered in
// registers.  Returning false makes SelectionDAG demote the return to an
// sret/stack convention instead.
// NOTE(review): the function-name line, the `Outs` parameter line, and the
// `RVLocs` vector declaration are elided in this excerpt of the file.
 3139 CallingConv::ID CallConv,
 3140 MachineFunction &MF, bool IsVarArg,
 3142 LLVMContext &Context) const {
 3143 // Replacing returns with sret/stack usage doesn't make sense for shaders.
 3144 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
 3145 // for shaders. Vector types should be explicitly handled by CC.
 3146 if (AMDGPU::isEntryFunctionCC(CallConv))
 3147 return true;
 3148
 3150 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
 // Run the return calling convention; if it cannot place every value, the
 // return cannot be lowered in registers.
 3151 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
 3152 return false;
 3153
 3154 // We must use the stack if return would require unavailable registers.
 3155 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
 3156 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
 // Any assignment landing in a VGPR above this function's maximum means the
 // required register is not actually available to it.
 3157 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
 3158 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
 3159 return false;
 3160
 3161 return true;
 3162}
3163
/// Lower an outgoing return.  Kernels are delegated to the AMDGPU base
/// implementation; for other functions the return values are copied into the
/// physical registers chosen by the return calling convention, and a shader
/// that returns void simply ends the wave (ENDPGM).
3164SDValue
3165SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3166 bool isVarArg,
3168 const SmallVectorImpl<SDValue> &OutVals,
3169 const SDLoc &DL, SelectionDAG &DAG) const {
3172
3173 if (AMDGPU::isKernel(CallConv)) {
3174 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3175 OutVals, DL, DAG);
3176 }
3177
3178 bool IsShader = AMDGPU::isShader(CallConv);
3179
3180 Info->setIfReturnsVoid(Outs.empty());
3181 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3182
3183 // CCValAssign - represent the assignment of the return value to a location.
3186
3187 // CCState - Info about the registers and stack slots.
3188 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3189 *DAG.getContext());
3190
3191 // Analyze outgoing return values.
3192 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3193
3194 SDValue Glue;
3196 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3197
3198 // Copy the result values into the output registers.
3199 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3200 ++I, ++RealRVLocIdx) {
3201 CCValAssign &VA = RVLocs[I];
3202 assert(VA.isRegLoc() && "Can only return in registers!");
3203 // TODO: Partially return in registers if return values don't fit.
3204 SDValue Arg = OutVals[RealRVLocIdx];
3205
 // Promote/convert the value to the form the location expects.
3206 // Copied from other backends.
3207 switch (VA.getLocInfo()) {
3208 case CCValAssign::Full:
3209 break;
3210 case CCValAssign::BCvt:
3211 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3212 break;
3213 case CCValAssign::SExt:
3214 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3215 break;
3216 case CCValAssign::ZExt:
3217 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3218 break;
3219 case CCValAssign::AExt:
3220 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3221 break;
3222 default:
3223 llvm_unreachable("Unknown loc info!");
3224 }
3225
 // Glue the copies together so they stay adjacent to the return.
3226 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3227 Glue = Chain.getValue(1);
3228 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3229 }
3230
3231 // FIXME: Does sret work properly?
 // Non-entry functions also list callee-saved registers restored via copies
 // as implicit return operands so they are treated as live across the call.
3232 if (!Info->isEntryFunction()) {
3233 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3234 const MCPhysReg *I =
3235 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3236 if (I) {
3237 for (; *I; ++I) {
3238 if (AMDGPU::SReg_64RegClass.contains(*I))
3239 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3240 else if (AMDGPU::SReg_32RegClass.contains(*I))
3241 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3242 else
3243 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3244 }
3245 }
3246 }
3247
3248 // Update chain and glue.
3249 RetOps[0] = Chain;
3250 if (Glue.getNode())
3251 RetOps.push_back(Glue);
3252
 // A void-returning shader ends the wave; otherwise a normal return opcode
 // is selected.  NOTE(review): the alternative-opcode assignment line is
 // elided in this excerpt — confirm against the full source.
3253 unsigned Opc = AMDGPUISD::ENDPGM;
3254 if (!IsWaveEnd)
3256 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3257}
3258
// Copy the values returned by a call out of their assigned physical
// registers into virtual registers / SDValues for the caller to use.
// NOTE(review): the function-name line and the `RVLocs` vector declaration
// are elided in this excerpt of the file.
 3260 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
 3261 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
 3262 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
 3263 SDValue ThisVal) const {
 3264 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
 3265
 3266 // Assign locations to each value returned by this call.
 3268 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
 3269 *DAG.getContext());
 3270 CCInfo.AnalyzeCallResult(Ins, RetCC);
 3271
 3272 // Copy all of the result registers out of their specified physreg.
 3273 for (CCValAssign VA : RVLocs) {
 3274 SDValue Val;
 3275
 3276 if (VA.isRegLoc()) {
 // Thread the chain and glue through each copy so the copies stay
 // attached to the call node.
 3277 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
 3278 Chain = Val.getValue(1);
 3279 InGlue = Val.getValue(2);
 3280 } else if (VA.isMemLoc()) {
 3281 report_fatal_error("TODO: return values in memory");
 3282 } else
 3283 llvm_unreachable("unknown argument location type");
 3284
 // Undo any promotion the calling convention applied to the value.
 3285 switch (VA.getLocInfo()) {
 3286 case CCValAssign::Full:
 3287 break;
 3288 case CCValAssign::BCvt:
 3289 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
 3290 break;
 3291 case CCValAssign::ZExt:
 3292 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
 3293 DAG.getValueType(VA.getValVT()));
 3294 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
 3295 break;
 3296 case CCValAssign::SExt:
 3297 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
 3298 DAG.getValueType(VA.getValVT()));
 3299 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
 3300 break;
 3301 case CCValAssign::AExt:
 3302 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
 3303 break;
 3304 default:
 3305 llvm_unreachable("Unknown loc info!");
 3306 }
 3307
 3308 InVals.push_back(Val);
 3309 }
 3310
 3311 return Chain;
 3312}
3313
3314// Add code to pass special inputs required depending on used features separate
3315// from the explicit user arguments present in the IR.
// For each implicit ABI input (dispatch ptr, queue ptr, workgroup IDs, ...)
// that the callee needs, forward the caller's incoming value to the callee's
// expected register or stack slot; then pack/forward the workitem IDs.
// NOTE(review): several continuation lines are elided in this excerpt
// (initializer of CalleeArgInfo, the ImplicitAttrs array name line, the
// WORKITEM_ID lookup argument lines, and the packed-ArgDescriptor ctor line).
 3317 CallLoweringInfo &CLI,
 3318 CCState &CCInfo,
 3319 const SIMachineFunctionInfo &Info,
 3320 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
 3321 SmallVectorImpl<SDValue> &MemOpChains,
 3322 SDValue Chain) const {
 3323 // If we don't have a call site, this was a call inserted by
 3324 // legalization. These can never use special inputs.
 3325 if (!CLI.CB)
 3326 return;
 3327
 3328 SelectionDAG &DAG = CLI.DAG;
 3329 const SDLoc &DL = CLI.DL;
 3330 const Function &F = DAG.getMachineFunction().getFunction();
 3331
 3332 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
 3333 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
 3334
 // Prefer the precise argument-usage info for a known direct callee; for
 // indirect calls the (elided) default initializer is used instead.
 3335 const AMDGPUFunctionArgInfo *CalleeArgInfo
 3337 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
 3338 // DAG.getPass() returns nullptr when using new pass manager.
 3339 // TODO: Use DAG.getMFAM() to access analysis result.
 3340 if (DAG.getPass()) {
 3341 auto &ArgUsageInfo =
 3343 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
 3344 }
 3345 }
 3346
 3347 // TODO: Unify with private memory register handling. This is complicated by
 3348 // the fact that at least in kernels, the input argument is not necessarily
 3349 // in the same location as the input.
 // Each implicit input is paired with the "amdgpu-no-*" attribute that, when
 // present on the call site, proves the callee does not use that input.
 3350 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
 3352 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
 3353 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
 3354 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
 3355 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
 3356 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
 3357 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
 3358 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
 3359 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
 3360 };
 3361
 3362 for (auto Attr : ImplicitAttrs) {
 3363 const ArgDescriptor *OutgoingArg;
 3364 const TargetRegisterClass *ArgRC;
 3365 LLT ArgTy;
 3366
 3367 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
 3368
 3369 // If the callee does not use the attribute value, skip copying the value.
 3370 if (CLI.CB->hasFnAttr(Attr.second))
 3371 continue;
 3372
 3373 std::tie(OutgoingArg, ArgRC, ArgTy) =
 3374 CalleeArgInfo->getPreloadedValue(InputID);
 3375 if (!OutgoingArg)
 3376 continue;
 3377
 3378 const ArgDescriptor *IncomingArg;
 3379 const TargetRegisterClass *IncomingArgRC;
 3380 LLT Ty;
 3381 std::tie(IncomingArg, IncomingArgRC, Ty) =
 3382 CallerArgInfo.getPreloadedValue(InputID);
 3383 assert(IncomingArgRC == ArgRC);
 3384
 3385 // All special arguments are ints for now.
 3386 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
 3387 SDValue InputReg;
 3388
 3389 if (IncomingArg) {
 3390 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
 3391 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
 3392 // The implicit arg ptr is special because it doesn't have a corresponding
 3393 // input for kernels, and is computed from the kernarg segment pointer.
 3394 InputReg = getImplicitArgPtr(DAG, DL);
 3395 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
 3396 std::optional<uint32_t> Id =
 3398 if (Id.has_value()) {
 3399 InputReg = DAG.getConstant(*Id, DL, ArgVT);
 3400 } else {
 3401 InputReg = DAG.getUNDEF(ArgVT);
 3402 }
 3403 } else {
 3404 // We may have proven the input wasn't needed, although the ABI is
 3405 // requiring it. We just need to allocate the register appropriately.
 3406 InputReg = DAG.getUNDEF(ArgVT);
 3407 }
 3408
 3409 if (OutgoingArg->isRegister()) {
 3410 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
 3411 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
 3412 report_fatal_error("failed to allocate implicit input argument");
 3413 } else {
 3414 unsigned SpecialArgOffset =
 3415 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
 3416 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
 3417 SpecialArgOffset);
 3418 MemOpChains.push_back(ArgStore);
 3419 }
 3420 }
 3421
 3422 // Pack workitem IDs into a single register or pass it as is if already
 3423 // packed.
 3424 const ArgDescriptor *OutgoingArg;
 3425 const TargetRegisterClass *ArgRC;
 3426 LLT Ty;
 3427
 // Find whichever workitem-ID component the callee actually takes; the
 // lookup-argument lines are elided in this excerpt.
 3428 std::tie(OutgoingArg, ArgRC, Ty) =
 3430 if (!OutgoingArg)
 3431 std::tie(OutgoingArg, ArgRC, Ty) =
 3433 if (!OutgoingArg)
 3434 std::tie(OutgoingArg, ArgRC, Ty) =
 3436 if (!OutgoingArg)
 3437 return;
 3438
 3439 const ArgDescriptor *IncomingArgX = std::get<0>(
 3441 const ArgDescriptor *IncomingArgY = std::get<0>(
 3443 const ArgDescriptor *IncomingArgZ = std::get<0>(
 3445
 3446 SDValue InputReg;
 3447 SDLoc SL;
 3448
 3449 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
 3450 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
 3451 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
 3452
 // The packed layout places X at bit 0, Y at bit 10 and Z at bit 20 (see the
 // SHL amounts below).
 3453 // If incoming ids are not packed we need to pack them.
 3454 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
 3455 NeedWorkItemIDX) {
 3456 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
 3457 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
 3458 } else {
 3459 InputReg = DAG.getConstant(0, DL, MVT::i32);
 3460 }
 3461 }
 3462
 3463 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
 3464 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
 3465 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
 3466 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
 3467 DAG.getShiftAmountConstant(10, MVT::i32, SL));
 3468 InputReg = InputReg.getNode() ?
 3469 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
 3470 }
 3471
 3472 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
 3473 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
 3474 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
 3475 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
 3476 DAG.getShiftAmountConstant(20, MVT::i32, SL));
 3477 InputReg = InputReg.getNode() ?
 3478 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
 3479 }
 3480
 3481 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
 3482 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
 3483 // We're in a situation where the outgoing function requires the workitem
 3484 // ID, but the calling function does not have it (e.g a graphics function
 3485 // calling a C calling convention function). This is illegal, but we need
 3486 // to produce something.
 3487 InputReg = DAG.getUNDEF(MVT::i32);
 3488 } else {
 3489 // Workitem ids are already packed, any of present incoming arguments
 3490 // will carry all required fields.
 3492 IncomingArgX ? *IncomingArgX :
 3493 IncomingArgY ? *IncomingArgY :
 3494 *IncomingArgZ, ~0u);
 3495 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
 3496 }
 3497 }
 3498
 3499 if (OutgoingArg->isRegister()) {
 3500 if (InputReg)
 3501 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
 3502
 3503 CCInfo.AllocateReg(OutgoingArg->getRegister());
 3504 } else {
 3505 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
 3506 if (InputReg) {
 3507 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
 3508 SpecialArgOffset);
 3509 MemOpChains.push_back(ArgStore);
 3510 }
 3511 }
 3512}
3513
3514static bool canGuaranteeTCO(CallingConv::ID CC) {
3515 return CC == CallingConv::Fast;
3516}
3517
3518/// Return true if we might ever do TCO for calls with this calling convention.
3519static bool mayTailCallThisCC(CallingConv::ID CC) {
3520 switch (CC) {
3521 case CallingConv::C:
 // NOTE(review): an additional case label (line elided in this excerpt)
 // falls through to this return — confirm against the full source.
3523 return true;
3524 default:
 // Anything else is only tail-callable when TCO can be guaranteed.
3525 return canGuaranteeTCO(CC);
3526 }
3527}
3528
// Determine whether a call site may be lowered as a tail call: the calling
// conventions, preserved-register masks, and stack-argument layout of caller
// and callee must all be compatible.
// NOTE(review): the function-name line, the MF/TRI declarations, the
// GuaranteedTailCallOpt `if` header, and the `ArgLocs` declaration are elided
// in this excerpt of the file.
 3530 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
 3532 const SmallVectorImpl<SDValue> &OutVals,
 3533 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
 // amdgcn.cs.chain "calls" are always lowered as tail calls by construction.
 3534 if (AMDGPU::isChainCC(CalleeCC))
 3535 return true;
 3536
 3537 if (!mayTailCallThisCC(CalleeCC))
 3538 return false;
 3539
 3540 // For a divergent call target, we need to do a waterfall loop over the
 3541 // possible callees which precludes us from using a simple jump.
 3542 if (Callee->isDivergent())
 3543 return false;
 3544
 3546 const Function &CallerF = MF.getFunction();
 3547 CallingConv::ID CallerCC = CallerF.getCallingConv();
 3549 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
 3550
 3551 // Kernels aren't callable, and don't have a live in return address so it
 3552 // doesn't make sense to do a tail call with entry functions.
 3553 if (!CallerPreserved)
 3554 return false;
 3555
 3556 bool CCMatch = CallerCC == CalleeCC;
 3557
 // Under guaranteed TCO (the enclosing `if` header is elided here), a
 // matching fastcc pair is accepted outright and anything else rejected.
 3559 if (canGuaranteeTCO(CalleeCC) && CCMatch)
 3560 return true;
 3561 return false;
 3562 }
 3563
 3564 // TODO: Can we handle var args?
 3565 if (IsVarArg)
 3566 return false;
 3567
 // Byval caller arguments live in the incoming stack area that a tail call
 // would reuse, so their presence blocks the transformation.
 3568 for (const Argument &Arg : CallerF.args()) {
 3569 if (Arg.hasByValAttr())
 3570 return false;
 3571 }
 3572
 3573 LLVMContext &Ctx = *DAG.getContext();
 3574
 3575 // Check that the call results are passed in the same way.
 3576 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
 3577 CCAssignFnForCall(CalleeCC, IsVarArg),
 3578 CCAssignFnForCall(CallerCC, IsVarArg)))
 3579 return false;
 3580
 3581 // The callee has to preserve all registers the caller needs to preserve.
 3582 if (!CCMatch) {
 3583 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
 3584 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
 3585 return false;
 3586 }
 3587
 3588 // Nothing more to check if the callee is taking no arguments.
 3589 if (Outs.empty())
 3590 return true;
 3591
 3593 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
 3594
 3595 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
 3596
 3597 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 3598 // If the stack arguments for this call do not fit into our own save area then
 3599 // the call cannot be made tail.
 3600 // TODO: Is this really necessary?
 3601 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
 3602 return false;
 3603
 // Finally, any stack-passed operands must match the caller's own incoming
 // callee-saved layout.
 3604 const MachineRegisterInfo &MRI = MF.getRegInfo();
 3605 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
 3606}
3607
// Quick IR-level screen for whether a call instruction could be emitted as a
// tail call: it must be marked `tail` and its parent function must pass the
// (elided) check on the line below.
// NOTE(review): the function-name line and the `if` condition consuming
// ParentFn are elided in this excerpt — confirm against the full source.
 3609 if (!CI->isTailCall())
 3610 return false;
 3611
 3612 const Function *ParentFn = CI->getParent()->getParent();
 3614 return false;
 3615 return true;
 3616}
3617
3618// The wave scratch offset register is used as the global base pointer.
3620 SmallVectorImpl<SDValue> &InVals) const {
3621 CallingConv::ID CallConv = CLI.CallConv;
3622 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3623
3624 SelectionDAG &DAG = CLI.DAG;
3625
3626 TargetLowering::ArgListEntry RequestedExec;
3627 if (IsChainCallConv) {
3628 // The last argument should be the value that we need to put in EXEC.
3629 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3630 // don't treat it like the rest of the arguments.
3631 RequestedExec = CLI.Args.back();
3632 assert(RequestedExec.Node && "No node for EXEC");
3633
3634 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3635 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3636
3637 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3638 CLI.Outs.pop_back();
3639 CLI.OutVals.pop_back();
3640
3641 if (RequestedExec.Ty->isIntegerTy(64)) {
3642 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3643 CLI.Outs.pop_back();
3644 CLI.OutVals.pop_back();
3645 }
3646
3647 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3648 "Haven't popped all the pieces of the EXEC mask");
3649 }
3650
3651 const SDLoc &DL = CLI.DL;
3653 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3655 SDValue Chain = CLI.Chain;
3656 SDValue Callee = CLI.Callee;
3657 bool &IsTailCall = CLI.IsTailCall;
3658 bool IsVarArg = CLI.IsVarArg;
3659 bool IsSibCall = false;
3661
3662 if (Callee.isUndef() || isNullConstant(Callee)) {
3663 if (!CLI.IsTailCall) {
3664 for (ISD::InputArg &Arg : CLI.Ins)
3665 InVals.push_back(DAG.getUNDEF(Arg.VT));
3666 }
3667
3668 return Chain;
3669 }
3670
3671 if (IsVarArg) {
3672 return lowerUnhandledCall(CLI, InVals,
3673 "unsupported call to variadic function ");
3674 }
3675
3676 if (!CLI.CB)
3677 report_fatal_error("unsupported libcall legalization");
3678
3679 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3680 return lowerUnhandledCall(CLI, InVals,
3681 "unsupported required tail call to function ");
3682 }
3683
3684 if (IsTailCall) {
3686 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3687 if (!IsTailCall &&
3688 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3689 report_fatal_error("failed to perform tail call elimination on a call "
3690 "site marked musttail or on llvm.amdgcn.cs.chain");
3691 }
3692
3693 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3694
3695 // A sibling call is one where we're under the usual C ABI and not planning
3696 // to change that but can still do a tail call:
3697 if (!TailCallOpt && IsTailCall)
3698 IsSibCall = true;
3699
3700 if (IsTailCall)
3701 ++NumTailCalls;
3702 }
3703
3706 SmallVector<SDValue, 8> MemOpChains;
3707
3708 // Analyze operands of the call, assigning locations to each operand.
3710 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3711 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3712
3713 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3714 // With a fixed ABI, allocate fixed registers before user arguments.
3715 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3716 }
3717
3718 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3719
3720 // Get a count of how many bytes are to be pushed on the stack.
3721 unsigned NumBytes = CCInfo.getStackSize();
3722
3723 if (IsSibCall) {
3724 // Since we're not changing the ABI to make this a tail call, the memory
3725 // operands are already available in the caller's incoming argument space.
3726 NumBytes = 0;
3727 }
3728
3729 // FPDiff is the byte offset of the call's argument area from the callee's.
3730 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3731 // by this amount for a tail call. In a sibling call it must be 0 because the
3732 // caller will deallocate the entire stack and the callee still expects its
3733 // arguments to begin at SP+0. Completely unused for non-tail calls.
3734 int32_t FPDiff = 0;
3735 MachineFrameInfo &MFI = MF.getFrameInfo();
3736
3737 // Adjust the stack pointer for the new arguments...
3738 // These operations are automatically eliminated by the prolog/epilog pass
3739 if (!IsSibCall)
3740 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3741
3742 if (!IsSibCall || IsChainCallConv) {
3743 if (!Subtarget->enableFlatScratch()) {
3744 SmallVector<SDValue, 4> CopyFromChains;
3745
3746 // In the HSA case, this should be an identity copy.
3747 SDValue ScratchRSrcReg
3748 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3749 RegsToPass.emplace_back(IsChainCallConv
3750 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3751 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3752 ScratchRSrcReg);
3753 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3754 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3755 }
3756 }
3757
3758 MVT PtrVT = MVT::i32;
3759
3760 // Walk the register/memloc assignments, inserting copies/loads.
3761 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3762 CCValAssign &VA = ArgLocs[i];
3763 SDValue Arg = OutVals[i];
3764
3765 // Promote the value if needed.
3766 switch (VA.getLocInfo()) {
3767 case CCValAssign::Full:
3768 break;
3769 case CCValAssign::BCvt:
3770 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3771 break;
3772 case CCValAssign::ZExt:
3773 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3774 break;
3775 case CCValAssign::SExt:
3776 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3777 break;
3778 case CCValAssign::AExt:
3779 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3780 break;
3781 case CCValAssign::FPExt:
3782 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3783 break;
3784 default:
3785 llvm_unreachable("Unknown loc info!");
3786 }
3787
3788 if (VA.isRegLoc()) {
3789 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3790 } else {
3791 assert(VA.isMemLoc());
3792
3793 SDValue DstAddr;
3794 MachinePointerInfo DstInfo;
3795
3796 unsigned LocMemOffset = VA.getLocMemOffset();
3797 int32_t Offset = LocMemOffset;
3798
3799 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3800 MaybeAlign Alignment;
3801
3802 if (IsTailCall) {
3803 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3804 unsigned OpSize = Flags.isByVal() ?
3805 Flags.getByValSize() : VA.getValVT().getStoreSize();
3806
3807 // FIXME: We can have better than the minimum byval required alignment.
3808 Alignment =
3809 Flags.isByVal()
3810 ? Flags.getNonZeroByValAlign()
3811 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3812
3813 Offset = Offset + FPDiff;
3814 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3815
3816 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3817 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3818
3819 // Make sure any stack arguments overlapping with where we're storing
3820 // are loaded before this eventual operation. Otherwise they'll be
3821 // clobbered.
3822
3823 // FIXME: Why is this really necessary? This seems to just result in a
3824 // lot of code to copy the stack and write them back to the same
3825 // locations, which are supposed to be immutable?
3826 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3827 } else {
3828 // Stores to the argument stack area are relative to the stack pointer.
3829 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3830 MVT::i32);
3831 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3832 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3833 Alignment =
3834 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3835 }
3836
3837 if (Outs[i].Flags.isByVal()) {
3838 SDValue SizeNode =
3839 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3840 SDValue Cpy =
3841 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3842 Outs[i].Flags.getNonZeroByValAlign(),
3843 /*isVol = */ false, /*AlwaysInline = */ true,
3844 /*CI=*/nullptr, std::nullopt, DstInfo,
3846
3847 MemOpChains.push_back(Cpy);
3848 } else {
3849 SDValue Store =
3850 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3851 MemOpChains.push_back(Store);
3852 }
3853 }
3854 }
3855
3856 if (!MemOpChains.empty())
3857 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3858
3859 // Build a sequence of copy-to-reg nodes chained together with token chain
3860 // and flag operands which copy the outgoing args into the appropriate regs.
3861 SDValue InGlue;
3862 for (auto &RegToPass : RegsToPass) {
3863 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3864 RegToPass.second, InGlue);
3865 InGlue = Chain.getValue(1);
3866 }
3867
3868
3869 // We don't usually want to end the call-sequence here because we would tidy
3870 // the frame up *after* the call, however in the ABI-changing tail-call case
3871 // we've carefully laid out the parameters so that when sp is reset they'll be
3872 // in the correct location.
3873 if (IsTailCall && !IsSibCall) {
3874 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3875 InGlue = Chain.getValue(1);
3876 }
3877
3878 std::vector<SDValue> Ops;
3879 Ops.push_back(Chain);
3880 Ops.push_back(Callee);
3881 // Add a redundant copy of the callee global which will not be legalized, as
3882 // we need direct access to the callee later.
3884 const GlobalValue *GV = GSD->getGlobal();
3885 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3886 } else {
3887 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3888 }
3889
3890 if (IsTailCall) {
3891 // Each tail call may have to adjust the stack by a different amount, so
3892 // this information must travel along with the operation for eventual
3893 // consumption by emitEpilogue.
3894 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3895 }
3896
3897 if (IsChainCallConv)
3898 Ops.push_back(RequestedExec.Node);
3899
3900 // Add argument registers to the end of the list so that they are known live
3901 // into the call.
3902 for (auto &RegToPass : RegsToPass) {
3903 Ops.push_back(DAG.getRegister(RegToPass.first,
3904 RegToPass.second.getValueType()));
3905 }
3906
3907 // Add a register mask operand representing the call-preserved registers.
3908 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3909 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3910 assert(Mask && "Missing call preserved mask for calling convention");
3911 Ops.push_back(DAG.getRegisterMask(Mask));
3912
3913 if (SDValue Token = CLI.ConvergenceControlToken) {
3915 GlueOps.push_back(Token);
3916 if (InGlue)
3917 GlueOps.push_back(InGlue);
3918
3919 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
3920 MVT::Glue, GlueOps),
3921 0);
3922 }
3923
3924 if (InGlue)
3925 Ops.push_back(InGlue);
3926
3927 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3928
3929 // If we're doing a tall call, use a TC_RETURN here rather than an
3930 // actual call instruction.
3931 if (IsTailCall) {
3932 MFI.setHasTailCall();
3933 unsigned OPC = AMDGPUISD::TC_RETURN;
3934 switch (CallConv) {
3937 break;
3941 break;
3942 }
3943
3944 return DAG.getNode(OPC, DL, NodeTys, Ops);
3945 }
3946
3947 // Returns a chain and a flag for retval copy to use.
3948 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3949 Chain = Call.getValue(0);
3950 InGlue = Call.getValue(1);
3951
3952 uint64_t CalleePopBytes = NumBytes;
3953 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3954 if (!Ins.empty())
3955 InGlue = Chain.getValue(1);
3956
3957 // Handle result values, copying them out of physregs into vregs that we
3958 // return.
3959 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3960 InVals, /*IsThisReturn=*/false, SDValue());
3961}
3962
3963// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3964// except for applying the wave size scale to the increment amount.
// NOTE(review): this is a doxygen-scraped copy; the leading numbers are line
// numbers fused into the text, and some link-only lines were lost (the
// function-name line 3965, and 3989-3990 which presumably select the ADD/SUB
// opcode from the stack growth direction — confirm against upstream).
// Code below is byte-preserved; only comments were added.
3966    SDValue Op, SelectionDAG &DAG) const {
3967  const MachineFunction &MF = DAG.getMachineFunction();
3969
3970  SDLoc dl(Op);
3971  EVT VT = Op.getValueType();
// DYNAMIC_STACKALLOC operands: chain (via Tmp1), size (via Tmp2's operand 1),
// and the alignment constant (operand 2).
3972  SDValue Tmp1 = Op;
3973  SDValue Tmp2 = Op.getValue(1);
3974  SDValue Tmp3 = Op.getOperand(2);
3975  SDValue Chain = Tmp1.getOperand(0);
3976
3977  Register SPReg = Info->getStackPtrOffsetReg();
3978
3979  // Chain the dynamic stack allocation so that it doesn't modify the stack
3980  // pointer when other instructions are using the stack.
3981  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3982
3983  SDValue Size = Tmp2.getOperand(1);
3984  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3985  Chain = SP.getValue(1);
3986  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3987  const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3988  unsigned Opc =
3991
// Scale the per-lane byte count by the wavefront size (shift left by
// log2(wavesize)) — the SI stack pointer is in swizzled scratch units.
3992  SDValue ScaledSize = DAG.getNode(
3993      ISD::SHL, dl, VT, Size,
3994      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3995
3996  Align StackAlign = TFL->getStackAlign();
3997  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3998  if (Alignment && *Alignment > StackAlign) {
// The alignment mask is also shifted by log2(wavesize) because the SP value
// being masked lives in the scaled (swizzled) address space.
3999    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
4000                       DAG.getConstant(-(uint64_t)Alignment->value()
4001                                           << Subtarget->getWavefrontSizeLog2(),
4002                                       dl, VT));
4003  }
4004
4005  Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
4006  Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4007
// Result pair: {new stack pointer value, output chain}.
4008  return DAG.getMergeValues({Tmp1, Tmp2}, dl);
4009}
4010
// Entry point for DYNAMIC_STACKALLOC lowering; dispatches constant-sized
// allocas to the wave-scaled expansion above.
// NOTE(review): extraction lost the function-name line (4011), the
// isa<ConstantSDNode>(Size) guard (4019) and the non-constant fallback
// return (4022) — confirm against upstream before editing.
4012                                                  SelectionDAG &DAG) const {
4013  // We only handle constant sizes here to allow non-entry block, static sized
4014  // allocas. A truly dynamic value is more difficult to support because we
4015  // don't know if the size value is uniform or not. If the size isn't uniform,
4016  // we would need to do a wave reduction to get the maximum size to know how
4017  // much to increment the uniform stack pointer.
4018  SDValue Size = Op.getOperand(1);
4020    return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
4021
4023}
4024
// Lowers STACKSAVE: copies the (wave-uniform) stack pointer and converts it to
// a per-lane (swizzled) vector address via AMDGPUISD::WAVE_ADDRESS.
// NOTE(review): the signature line and the declaration of `SP` (presumably the
// stack-pointer register, line 4029) were lost in extraction — verify upstream.
4026  if (Op.getValueType() != MVT::i32)
4027    return Op; // Defer to cannot select error.
4029
4030  SDLoc SL(Op);
4031
4032  SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4033
4034  // Convert from wave uniform to swizzled vector address. This should protect
4035  // from any edge cases where the stacksave result isn't directly used with
4036  // stackrestore.
4037  SDValue VectorAddress =
4038      DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
// Returns {vector address, chain from the CopyFromReg}.
4039  return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4040}
4041
// Lowers GET_ROUNDING: reads the MODE register's combined f32 + f64/f16
// rounding fields with s_getreg and maps the raw 4-bit hardware value to the
// FLT_ROUNDS enum through a 64-bit constant lookup table.
// NOTE(review): lines 4042 (signature), 4048 (the hwreg encoding for
// BothRoundHwReg) and 4085 (the FltRoundConversionTable constant) were lost
// in extraction — confirm against upstream.
4043                                              SelectionDAG &DAG) const {
4044  SDLoc SL(Op);
4045  assert(Op.getValueType() == MVT::i32);
4046
4047  uint32_t BothRoundHwReg =
4049  SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4050
4051  SDValue IntrinID =
4052      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4053  SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4054                               Op.getOperand(0), IntrinID, GetRoundBothImm);
4055
4056  // There are two rounding modes, one for f32 and one for f64/f16. We only
4057  // report in the standard value range if both are the same.
4058  //
4059  // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4060  // ties away from zero is not supported, and the other values are rotated by
4061  // 1.
4062  //
4063  // If the two rounding modes are not the same, report a target defined value.
4064
4065  // Mode register rounding mode fields:
4066  //
4067  // [1:0] Single-precision round mode.
4068  // [3:2] Double/Half-precision round mode.
4069  //
4070  // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4071  //
4072  // Hardware Spec
4073  // Toward-0 3 0
4074  // Nearest Even 0 1
4075  // +Inf 1 2
4076  // -Inf 2 3
4077  // NearestAway0 N/A 4
4078  //
4079  // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4080  // table we can index by the raw hardware mode.
4081  //
4082  // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4083
4084  SDValue BitTable =
4086
// Each table entry is 4 bits wide, so the raw mode is multiplied by 4
// (shift left by 2) to form the bit offset into the table.
4087  SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4088  SDValue RoundModeTimesNumBits =
4089      DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4090
4091  // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4092  // knew only one mode was demanded.
4093  SDValue TableValue =
4094      DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4095  SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4096
4097  SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4098  SDValue TableEntry =
4099      DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4100
4101  // There's a gap in the 4-bit encoded table and actual enum values, so offset
4102  // if it's an extended value.
4103  SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4104  SDValue IsStandardValue =
4105      DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4106  SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4107  SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4108                               TableEntry, EnumOffset);
4109
// Returns {FLT_ROUNDS-style value, chain from the s_getreg}.
4110  return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4111}
4112
// Lowers SET_ROUNDING: translates a C FLT_ROUNDS-style value into the
// hardware MODE.fp_round encoding (constant-folded when possible, otherwise
// via a bit-table lookup) and emits an s_setreg of both rounding fields.
// NOTE(review): lines 4113 (signature), 4125 (the clamp upper bound) and
// 4154 (the FltRoundToHWConversionTable constant) were lost in extraction.
4114                                             SelectionDAG &DAG) const {
4115  SDLoc SL(Op);
4116
4117  SDValue NewMode = Op.getOperand(1);
4118  assert(NewMode.getValueType() == MVT::i32);
4119
4120  // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4121  // hardware MODE.fp_round values.
4122  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
// Constant input: clamp and translate at compile time.
4123    uint32_t ClampedVal = std::min(
4124        static_cast<uint32_t>(ConstMode->getZExtValue()),
4126    NewMode = DAG.getConstant(
4127        AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4128  } else {
4129    // If we know the input can only be one of the supported standard modes in
4130    // the range 0-3, we can use a simplified mapping to hardware values.
4131    KnownBits KB = DAG.computeKnownBits(NewMode);
4132    const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4133    // The supported standard values are 0-3. The extended values start at 8. We
4134    // need to offset by 4 if the value is in the extended range.
4135
4136    if (UseReducedTable) {
4137      // Truncate to the low 32-bits.
4138      SDValue BitTable = DAG.getConstant(
4139          AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4140
// 4-bit entries: index * 4 = bit offset into the table.
4141      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4142      SDValue RoundModeTimesNumBits =
4143          DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4144
4145      NewMode =
4146          DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4147
4148      // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4149      // the table extracted bits into inline immediates.
4150    } else {
4151      // table_index = umin(value, value - 4)
4152      // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4153      SDValue BitTable =
4155
// The umin trick folds the two disjoint input ranges (0-3 standard,
// 8+ extended) into one contiguous table index, as described above.
4156      SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4157      SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4158      SDValue IndexVal =
4159          DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4160
4161      SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4162      SDValue RoundModeTimesNumBits =
4163          DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4164
4165      SDValue TableValue =
4166          DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4167      SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4168
4169      // No need to mask out the high bits since the setreg will ignore them
4170      // anyway.
4171      NewMode = TruncTable;
4172    }
4173
4174    // Insert a readfirstlane in case the value is a VGPR. We could do this
4175    // earlier and keep more operations scalar, but that interferes with
4176    // combining the source.
4177    SDValue ReadFirstLaneID =
4178        DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4179    NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4180                          ReadFirstLaneID, NewMode);
4181  }
4182
4183  // N.B. The setreg will be later folded into s_round_mode on supported
4184  // targets.
4185  SDValue IntrinID =
4186      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4187  uint32_t BothRoundHwReg =
4189  SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4190
4191  SDValue SetReg =
4192      DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4193                  IntrinID, RoundBothImm, NewMode);
4194
4195  return SetReg;
4196}
4197
// Keeps the memory op only for uniform (non-divergent) accesses to an
// allowed set of address spaces; otherwise drops it by returning SDValue().
// NOTE(review): the signature line and the specific `case` address-space
// labels (4203-4206) were lost in extraction — appears to be the PREFETCH
// lowering; confirm against upstream.
4199  if (Op->isDivergent())
4200    return SDValue();
4201
4202  switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4207    break;
4208  default:
4209    return SDValue();
4210  }
4211
// Legal as-is for the accepted address spaces.
4212  return Op;
4213}
4214
4215// Work around DAG legality rules only based on the result type.
// Handles (STRICT_)FP_EXTEND from bf16 sources: bitcasts the bf16 source to
// its integer type and re-expresses the extend as BF16_TO_FP. Non-bf16
// sources pass through untouched. (Signature line 4216 lost in extraction.)
4217  bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4218  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4219  EVT SrcVT = Src.getValueType();
4220
4221  if (SrcVT.getScalarType() != MVT::bf16)
4222    return Op;
4223
4224  SDLoc SL(Op);
4225  SDValue BitCast =
4226      DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4227
4228  EVT DstVT = Op.getValueType();
4229  if (IsStrict)
// Strict FP is not implemented for this path; fail loudly rather than
// silently dropping the chain/exception semantics.
4230    llvm_unreachable("Need STRICT_BF16_TO_FP");
4231
4232  return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4233}
4234
// Reads the FP environment as an i64: two s_getreg reads (MODE field and a
// trap-control field — their hwreg encodings on lines 4241/4244 were lost in
// extraction) packed into a v2i32 and bitcast to i64.
4236  SDLoc SL(Op);
4237  if (Op.getValueType() != MVT::i64)
4238    return Op;
4239
4240  uint32_t ModeHwReg =
4242  SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4243  uint32_t TrapHwReg =
4245  SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4246
4247  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4248  SDValue IntrinID =
4249      DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4250  SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4251                                   Op.getOperand(0), IntrinID, ModeHwRegImm);
4252  SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4253                                   Op.getOperand(0), IntrinID, TrapHwRegImm);
// Merge the two read chains so neither register read can be reordered away.
4254  SDValue TokenReg =
4255      DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4256                  GetTrapReg.getValue(1));
4257
// Pack {mode, trap} into the low/high halves of the i64 result.
4258  SDValue CvtPtr =
4259      DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4260  SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4261
4262  return DAG.getMergeValues({Result, TokenReg}, SL);
4263}
4264
// Writes the FP environment from an i64: splits it into two i32 halves,
// forces them uniform with readfirstlane, and emits two s_setreg writes
// (mode field, then a trap-control field; their hwreg encodings on lines
// 4284/4287 were lost in extraction).
4266  SDLoc SL(Op);
4267  if (Op.getOperand(1).getValueType() != MVT::i64)
4268    return Op;
4269
4270  SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4271  SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4272                                   DAG.getConstant(0, SL, MVT::i32));
4273  SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4274                                   DAG.getConstant(1, SL, MVT::i32));
4275
// s_setreg takes an SGPR source; readfirstlane guarantees uniformity even if
// the incoming value happens to live in a VGPR.
4276  SDValue ReadFirstLaneID =
4277      DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4278  NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4279                           ReadFirstLaneID, NewModeReg);
4280  NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4281                           ReadFirstLaneID, NewTrapReg);
4282
4283  unsigned ModeHwReg =
4285  SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4286  unsigned TrapHwReg =
4288  SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4289
4290  SDValue IntrinID =
4291      DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4292  SDValue SetModeReg =
4293      DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4294                  IntrinID, ModeHwRegImm, NewModeReg);
4295  SDValue SetTrapReg =
4296      DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4297                  IntrinID, TrapHwRegImm, NewTrapReg);
// Both writes hang off the incoming chain; tie them together so neither is
// dropped.
4298  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4299}
4300
// Resolves a named physical register (for read_register/write_register style
// queries): maps the name string, rejects unknown names, registers the
// subtarget lacks, and size mismatches with report_fatal_error.
// NOTE(review): the signature line (4301) and the StringSwitch head line
// (4303, presumably `Register Reg = StringSwitch<Register>(RegName)`) were
// lost in extraction.
4302                                               const MachineFunction &MF) const {
4304                        .Case("m0", AMDGPU::M0)
4305                        .Case("exec", AMDGPU::EXEC)
4306                        .Case("exec_lo", AMDGPU::EXEC_LO)
4307                        .Case("exec_hi", AMDGPU::EXEC_HI)
4308                        .Case("flat_scratch", AMDGPU::FLAT_SCR)
4309                        .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4310                        .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4311                        .Default(Register());
4312
4313  if (Reg == AMDGPU::NoRegister) {
4314    report_fatal_error(Twine("invalid register name \""
4315                             + StringRef(RegName)  + "\"."));
4316
4317  }
4318
// flat_scratch (and its halves) only exist as a real register on subtargets
// that have it; reject otherwise.
4319  if (!Subtarget->hasFlatScrRegister() &&
4320       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4321    report_fatal_error(Twine("invalid register \""
4322                             + StringRef(RegName)  + "\" for subtarget."));
4323  }
4324
// Enforce that the requested value type matches the register width:
// 32-bit for the *_LO/_HI halves and m0, 64-bit for the full pairs.
4325  switch (Reg) {
4326  case AMDGPU::M0:
4327  case AMDGPU::EXEC_LO:
4328  case AMDGPU::EXEC_HI:
4329  case AMDGPU::FLAT_SCR_LO:
4330  case AMDGPU::FLAT_SCR_HI:
4331    if (VT.getSizeInBits() == 32)
4332      return Reg;
4333    break;
4334  case AMDGPU::EXEC:
4335  case AMDGPU::FLAT_SCR:
4336    if (VT.getSizeInBits() == 64)
4337      return Reg;
4338    break;
4339  default:
4340    llvm_unreachable("missing register type checking");
4341  }
4342
4343  report_fatal_error(Twine("invalid type for register \""
4344                           + StringRef(RegName) + "\"."));
4345}
4346
4347// If kill is not the last instruction, split the block so kill is always a
4348// proper terminator.
// Splits the block at the kill pseudo and swaps its opcode for the terminator
// form; returns the newly created successor block.
// NOTE(review): lines 4349-4350 (signature) and 4353 (obtaining TII) were
// lost in extraction.
4351                                                 MachineBasicBlock *BB) const {
4352  MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4354  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4355  return SplitBB;
4356}
4357
4358// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
4359// \p MI will be the only instruction in the loop body block. Otherwise, it will
4360// be the first instruction in the remainder block.
4361//
4362/// \returns { LoopBody, Remainder }
// NOTE(review): the parameter list (4364-4366) and the creation of LoopBB /
// the MBB iterator (around 4370-4372) were lost in extraction.
4363static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4367
4368  // To insert the loop we need to split the block. Move everything after this
4369  // point to a new block, and insert a new empty block between the two.
4371  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4373  ++MBBI;
4374
4375  MF->insert(MBBI, LoopBB);
4376  MF->insert(MBBI, RemainderBB);
4377
// The loop body branches back to itself (the loop) or forward to the
// remainder once done.
4378  LoopBB->addSuccessor(LoopBB);
4379  LoopBB->addSuccessor(RemainderBB);
4380
4381  // Move the rest of the block into a new block.
4382  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4383
4384  if (InstInLoop) {
4385    auto Next = std::next(I);
4386
4387    // Move instruction to loop body.
4388    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4389
4390    // Move the rest of the block.
4391    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4392  } else {
4393    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4394  }
4395
4396  MBB.addSuccessor(LoopBB);
4397
4398  return std::pair(LoopBB, RemainderBB);
4399}
4400
4401/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
// The bundle keeps the wait glued to the instruction so later passes cannot
// schedule anything in between. (Signature line and TII acquisition around
// 4402/4404 were lost in extraction.)
4403  MachineBasicBlock *MBB = MI.getParent();
4405  auto I = MI.getIterator();
4406  auto E = std::next(I);
4407
// S_WAITCNT 0: wait for all outstanding counters to drain.
4408  BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4409    .addImm(0);
4410
4411  MIBundleBuilder Bundler(*MBB, I, E);
4412  finalizeBundle(*MBB, Bundler.begin());
4413}
4414
// Emits a retry loop around \p MI: clears TRAP_STS.MEM_VIOL, runs the
// instruction, then re-reads the field and branches back to the loop while it
// is nonzero. Returns the remainder block that follows the loop.
// NOTE(review): the signature (4415-4416), the TII acquisition (4420-4421),
// the bundle call around 4424, and the MRI acquisition (4442) were lost in
// extraction.
4417                                                  MachineBasicBlock *BB) const {
4418  const DebugLoc &DL = MI.getDebugLoc();
4419
4421
4422  MachineBasicBlock *LoopBB;
4423  MachineBasicBlock *RemainderBB;
4425
4426  // Apparently kill flags are only valid if the def is in the same block?
4427  if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4428    Src->setIsKill(false);
4429
// InstInLoop=true: MI itself becomes the body of the retry loop.
4430  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4431
4432  MachineBasicBlock::iterator I = LoopBB->end();
4433
4434  const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4436
4437  // Clear TRAP_STS.MEM_VIOL
4438  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4439    .addImm(0)
4440    .addImm(EncodedReg);
4441
4443
4444  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4445
4446  // Load and check TRAP_STS.MEM_VIOL
4447  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4448    .addImm(EncodedReg);
4449
4450  // FIXME: Do we need to use an isel pseudo that may clobber scc?
4451  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4452    .addReg(Reg, RegState::Kill)
4453    .addImm(0);
// Retry while the field read back nonzero.
4454  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4455    .addMBB(LoopBB);
4456
4457  return RemainderBB;
4458}
4459
4460// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4461// wavefront. If the value is uniform and just happens to be in a VGPR, this
4462// will only do one iteration. In the worst case, this will loop 64 times.
4463//
4464// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
// Emits the body of the waterfall loop: readfirstlane one index value,
// compare it against every lane's index, mask EXEC down to the matching
// lanes, set up M0 (or an SGPR index in GPR-index mode), and loop until all
// lanes are covered. Returns the insertion point for the indexed operation.
// (Signature line 4465 was lost in extraction.)
4466
4467    MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4468    const DebugLoc &DL, const MachineOperand &Idx,
4469    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4470    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4471    Register &SGPRIdxReg) {
4472
4473  MachineFunction *MF = OrigBB.getParent();
4474  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4475  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4477
4478  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4479  Register PhiExec = MRI.createVirtualRegister(BoolRC);
4480  Register NewExec = MRI.createVirtualRegister(BoolRC);
4481  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4482  Register CondReg = MRI.createVirtualRegister(BoolRC);
4483
// PHIs carry the partial result and the remaining-lanes exec mask across
// iterations.
4484  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4485    .addReg(InitReg)
4486    .addMBB(&OrigBB)
4487    .addReg(ResultReg)
4488    .addMBB(&LoopBB);
4489
4490  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4491    .addReg(InitSaveExecReg)
4492    .addMBB(&OrigBB)
4493    .addReg(NewExec)
4494    .addMBB(&LoopBB);
4495
4496  // Read the next variant <- also loop target.
4497  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4498    .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4499
4500  // Compare the just read M0 value to all possible Idx values.
4501  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4502    .addReg(CurrentIdxReg)
4503    .addReg(Idx.getReg(), 0, Idx.getSubReg());
4504
4505  // Update EXEC, save the original EXEC value to VCC.
4506  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4507                                                : AMDGPU::S_AND_SAVEEXEC_B64),
4508          NewExec)
4509    .addReg(CondReg, RegState::Kill);
4510
// Hint the allocator to coalesce NewExec with the condition register.
4511  MRI.setSimpleHint(NewExec, CondReg);
4512
4513  if (UseGPRIdxMode) {
4514    if (Offset == 0) {
4515      SGPRIdxReg = CurrentIdxReg;
4516    } else {
4517      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4518      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4519          .addReg(CurrentIdxReg, RegState::Kill)
4520          .addImm(Offset);
4521    }
4522  } else {
4523    // Move index from VCC into M0
4524    if (Offset == 0) {
4525      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4526        .addReg(CurrentIdxReg, RegState::Kill);
4527    } else {
4528      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4529          .addReg(CurrentIdxReg, RegState::Kill)
4530          .addImm(Offset);
4531    }
4532  }
4533
4534  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4535  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4536  MachineInstr *InsertPt =
4537    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4538                                                  : AMDGPU::S_XOR_B64_term), Exec)
4539      .addReg(Exec)
4540      .addReg(NewExec);
4541
4542  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4543  // s_cbranch_scc0?
4544
4545  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4546  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4547    .addMBB(&LoopBB);
4548
4549  return InsertPt->getIterator();
4550}
4551
4552// This has slightly sub-optimal regalloc when the source vector is killed by
4553// the read. The register allocator does not understand that the kill is
4554// per-workitem, so is kept alive for the whole loop so we end up not re-using a
4555// subregister from it, using 1 more VGPR than necessary. This was saved when
4556// this was expanded after register allocation.
// Wraps emitLoadM0FromVGPRLoop: saves EXEC, splits the block into a waterfall
// loop plus remainder, runs the loop, then restores EXEC in a fresh landing
// pad inserted between them. Returns the insertion point inside the loop.
// NOTE(review): the signature lines (4557-4558), MF acquisition (4561), and
// the TII/MRI/iterator setup lines (4564, 4566, 4592) were lost in
// extraction.
4559    unsigned InitResultReg, unsigned PhiReg, int Offset,
4560    bool UseGPRIdxMode, Register &SGPRIdxReg) {
4562  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4563  const SIRegisterInfo *TRI = ST.getRegisterInfo();
4565  const DebugLoc &DL = MI.getDebugLoc();
4567
4568  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
4569  Register DstReg = MI.getOperand(0).getReg();
4570  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4571  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4572  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4573  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4574
4575  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4576
4577  // Save the EXEC mask
4578  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4579    .addReg(Exec);
4580
4581  MachineBasicBlock *LoopBB;
4582  MachineBasicBlock *RemainderBB;
4583  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4584
4585  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4586
4587  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4588                                      InitResultReg, DstReg, PhiReg, TmpExec,
4589                                      Offset, UseGPRIdxMode, SGPRIdxReg);
4590
// A separate landing pad restores EXEC after the loop so the restore is not
// part of the loop body itself.
4591  MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4593  ++MBBI;
4594  MF->insert(MBBI, LandingPad);
4595  LoopBB->removeSuccessor(RemainderBB);
4596  LandingPad->addSuccessor(RemainderBB);
4597  LoopBB->addSuccessor(LandingPad);
4598  MachineBasicBlock::iterator First = LandingPad->begin();
4599  BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4600    .addReg(SaveExec);
4601
4602  return InsPt;
4603}
4604
4605// Returns subreg index, offset
// Converts a constant element offset into a subregister index of \p SuperRC.
// Out-of-range offsets keep sub0 and return the raw offset so the caller
// falls back to dynamic indexing. (The function-name line 4607 was lost in
// extraction.)
4606static std::pair<unsigned, int>
4608                            const TargetRegisterClass *SuperRC,
4609                            unsigned VecReg,
4610                            int Offset) {
// Number of 32-bit lanes in the super-register class.
4611  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4612
4613  // Skip out of bounds offsets, or else we would end up using an undefined
4614  // register.
4615  if (Offset >= NumElts || Offset < 0)
4616    return std::pair(AMDGPU::sub0, Offset);
4617
4618  return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4619}
4620
// Materializes M0 = idx (+ Offset) from the instruction's SGPR `idx` operand,
// for MOVREL-style indexed access. (Signature lines around 4620-4622 and the
// TII acquisition line 4626 were lost in extraction.)
4623                                 int Offset) {
4624  MachineBasicBlock *MBB = MI.getParent();
4625  const DebugLoc &DL = MI.getDebugLoc();
4627
4628  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4629
4630  assert(Idx->getReg() != AMDGPU::NoRegister);
4631
4632  if (Offset == 0) {
// No offset: plain copy into M0.
4633    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4634  } else {
4635    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4636      .add(*Idx)
4637      .addImm(Offset);
4638  }
4639}
4640
// Returns an SGPR holding idx (+ Offset) for GPR-index mode: the idx operand
// itself when Offset is zero, otherwise a fresh SGPR with the sum.
// (Signature lines around 4641-4642 and the TII/MRI setup lines 4646/4652
// were lost in extraction.)
4643                              int Offset) {
4644  MachineBasicBlock *MBB = MI.getParent();
4645  const DebugLoc &DL = MI.getDebugLoc();
4647
4648  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4649
4650  if (Offset == 0)
4651    return Idx->getReg();
4652
4653  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4654  BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4655      .add(*Idx)
4656      .addImm(Offset);
4657  return Tmp;
4658}
4659
// Expands an indirect vector-element *read* pseudo. With an SGPR index it
// emits a single indexed read (GPR-index pseudo or v_movrels via M0); with a
// VGPR index it builds a waterfall loop via loadM0FromVGPR. Returns the block
// in which subsequent instructions should be inserted.
// NOTE(review): the signature lines (4660-4661), MBB/MRI setup (4665-4666),
// and the insertion-point / index setup lines inside the SGPR branch
// (4684, 4691, 4700, 4714) were lost in extraction.
4662                                          const GCNSubtarget &ST) {
4663  const SIInstrInfo *TII = ST.getInstrInfo();
4664  const SIRegisterInfo &TRI = TII->getRegisterInfo();
4667
4668  Register Dst = MI.getOperand(0).getReg();
4669  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4670  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4671  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4672
4673  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4674  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4675
// Fold a constant offset into a static subregister where possible.
4676  unsigned SubReg;
4677  std::tie(SubReg, Offset)
4678    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4679
4680  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4681
4682  // Check for a SGPR index.
4683  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4685    const DebugLoc &DL = MI.getDebugLoc();
4686
4687    if (UseGPRIdxMode) {
4688      // TODO: Look at the uses to avoid the copy. This may require rescheduling
4689      // to avoid interfering with other uses, so probably requires a new
4690      // optimization pass.
4692
4693      const MCInstrDesc &GPRIDXDesc =
4694          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4695      BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4696          .addReg(SrcReg)
4697          .addReg(Idx)
4698          .addImm(SubReg);
4699    } else {
4701
// v_movrels reads SrcReg[SubReg + M0]; the implicit use keeps the whole
// source vector alive.
4702      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4703        .addReg(SrcReg, 0, SubReg)
4704        .addReg(SrcReg, RegState::Implicit);
4705    }
4706
4707    MI.eraseFromParent();
4708
4709    return &MBB;
4710  }
4711
4712  // Control flow needs to be inserted if indexing with a VGPR.
4713  const DebugLoc &DL = MI.getDebugLoc();
4715
4716  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4717  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4718
4719  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4720
4721  Register SGPRIdxReg;
4722  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4723                              UseGPRIdxMode, SGPRIdxReg);
4724
4725  MachineBasicBlock *LoopBB = InsPt->getParent();
4726
4727  if (UseGPRIdxMode) {
4728    const MCInstrDesc &GPRIDXDesc =
4729        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4730
4731    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4732        .addReg(SrcReg)
4733        .addReg(SGPRIdxReg)
4734        .addImm(SubReg);
4735  } else {
4736    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4737      .addReg(SrcReg, 0, SubReg)
4738      .addReg(SrcReg, RegState::Implicit);
4739  }
4740
4741  MI.eraseFromParent();
4742
4743  return LoopBB;
4744}
4745
// Expands an indirect vector-element *write* pseudo. A known-invalid index
// degenerates to INSERT_SUBREG; an SGPR index emits a single indexed write
// (GPR-index pseudo or movreld via M0); a VGPR index builds a waterfall loop
// via loadM0FromVGPR. Returns the block for subsequent insertion.
// NOTE(review): the signature lines (4746-4747), MBB/MRI setup (4751-4752),
// and several insertion-point / index setup lines (4772, 4788, 4792, 4802)
// were lost in extraction.
4748                                          const GCNSubtarget &ST) {
4749  const SIInstrInfo *TII = ST.getInstrInfo();
4750  const SIRegisterInfo &TRI = TII->getRegisterInfo();
4753
4754  Register Dst = MI.getOperand(0).getReg();
4755  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4756  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4757  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4758  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4759  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4760  const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4761
4762  // This can be an immediate, but will be folded later.
4763  assert(Val->getReg());
4764
4765  unsigned SubReg;
4766  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4767                                                         SrcVec->getReg(),
4768                                                         Offset);
4769  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4770
4771  if (Idx->getReg() == AMDGPU::NoRegister) {
4773    const DebugLoc &DL = MI.getDebugLoc();
4774
// With no index register the element is statically known; a plain
// INSERT_SUBREG suffices.
4775    assert(Offset == 0);
4776
4777    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4778        .add(*SrcVec)
4779        .add(*Val)
4780        .addImm(SubReg);
4781
4782    MI.eraseFromParent();
4783    return &MBB;
4784  }
4785
4786  // Check for a SGPR index.
4787  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4789    const DebugLoc &DL = MI.getDebugLoc();
4790
4791    if (UseGPRIdxMode) {
4793
4794      const MCInstrDesc &GPRIDXDesc =
4795          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4796      BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4797          .addReg(SrcVec->getReg())
4798          .add(*Val)
4799          .addReg(Idx)
4800          .addImm(SubReg);
4801    } else {
4803
4804      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4805          TRI.getRegSizeInBits(*VecRC), 32, false);
4806      BuildMI(MBB, I, DL, MovRelDesc, Dst)
4807          .addReg(SrcVec->getReg())
4808          .add(*Val)
4809          .addImm(SubReg);
4810    }
4811    MI.eraseFromParent();
4812    return &MBB;
4813  }
4814
4815  // Control flow needs to be inserted if indexing with a VGPR.
// The value is re-read every loop iteration, so its kill flag (if any) would
// be wrong inside the waterfall loop.
4816  if (Val->isReg())
4817    MRI.clearKillFlags(Val->getReg());
4818
4819  const DebugLoc &DL = MI.getDebugLoc();
4820
4821  Register PhiReg = MRI.createVirtualRegister(VecRC);
4822
4823  Register SGPRIdxReg;
4824  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4825                              UseGPRIdxMode, SGPRIdxReg);
4826  MachineBasicBlock *LoopBB = InsPt->getParent();
4827
4828  if (UseGPRIdxMode) {
4829    const MCInstrDesc &GPRIDXDesc =
4830        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4831
4832    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4833        .addReg(PhiReg)
4834        .add(*Val)
4835        .addReg(SGPRIdxReg)
4836        .addImm(SubReg);
4837  } else {
4838    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4839        TRI.getRegSizeInBits(*VecRC), 32, false);
4840    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4841        .addReg(PhiReg)
4842        .add(*Val)
4843        .addImm(SubReg);
4844  }
4845
4846  MI.eraseFromParent();
4847  return LoopBB;
4848}
4849
// Lower a WAVE_REDUCE_*_PSEUDO to machine IR. For a uniform (SGPR) input the
// reduction is the input itself; for a VGPR input an iterative loop over the
// active lanes is emitted.
// NOTE(review): this listing elides some hyperlinked lines (source numbers
// jump 4849->4852, 4853->4855, 4878->4880); the function header naming this
// routine and the declarations of `MRI`, `BB`, `MI` and the iterator `I`
// used below are on those elided lines — confirm against the real source.
4852 const GCNSubtarget &ST,
4853 unsigned Opc) {
4855 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4856 const DebugLoc &DL = MI.getDebugLoc();
4857 const SIInstrInfo *TII = ST.getInstrInfo();
4858
4859 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4860 Register SrcReg = MI.getOperand(1).getReg();
4861 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4862 Register DstReg = MI.getOperand(0).getReg();
4863 MachineBasicBlock *RetBB = nullptr;
4864 if (isSGPR) {
4865 // These operations with a uniform value i.e. SGPR are idempotent.
4866 // Reduced value will be same as given sgpr: a plain copy suffices.
4867 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4868 RetBB = &BB;
4869 } else {
4870 // TODO: Implement DPP Strategy and switch based on immediate strategy
4871 // operand. For now, for all the cases (default, Iterative and DPP) we use
4872 // the iterative approach by default.
4873
4874 // To reduce the VGPR using the iterative approach, we need to iterate
4875 // over all the active lanes. Lowering consists of ComputeLoop,
4876 // which iterates over only the active lanes. We use a copy of the EXEC
4877 // register as induction variable and every active lane clears its bit
4878 // (bitset0) so that we will get the next active lane for next iteration.
4880 Register SrcReg = MI.getOperand(1).getReg();
4881
4882 // Create control flow for the loop:
4883 // split MI's machine basic block into a for-loop shape.
4884 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4885
4886 // Create virtual registers required for lowering.
4887 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4888 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4889 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4890 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4891
4892 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4893 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4894 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4895
4896 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4897 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4898
4899 bool IsWave32 = ST.isWave32();
4900 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4901 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4902
// Seed the accumulator with the operation's identity element: UINT32_MAX
// for umin, 0 for umax (the only two Opc values passed by the callers
// visible in this file).
4903 // Create initial values of induction variable from Exec, Accumulator and
4904 // insert branch instr to the newly created ComputeLoop block.
4905 uint32_t InitalValue =
4906 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4907 auto TmpSReg =
4908 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4909 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4910 .addImm(InitalValue);
4911 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4912
4913 // Start constructing ComputeLoop. The PHIs' loop-edge operands are
4914 // filled in after the loop body is built (see "Add phi nodes" below).
4914 I = ComputeLoop->end();
4915 auto Accumulator =
4916 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4917 .addReg(InitalValReg)
4918 .addMBB(&BB);
4919 auto ActiveBits =
4920 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4921 .addReg(TmpSReg->getOperand(0).getReg())
4922 .addMBB(&BB);
4923
4924 // Perform the computations: find the lowest remaining active lane,
4925 // read that lane's value from the source VGPR, and fold it into the
4926 // accumulator. Note the folded result is written straight to DstReg.
4925 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4926 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4927 .addReg(ActiveBits->getOperand(0).getReg());
4928 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4929 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4930 .addReg(SrcReg)
4931 .addReg(FF1->getOperand(0).getReg());
4932 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4933 .addReg(Accumulator->getOperand(0).getReg())
4934 .addReg(LaneValue->getOperand(0).getReg());
4935
4936 // Manipulate the iterator to get the next active lane: clear the bit of
4937 // the lane just processed.
4937 unsigned BITSETOpc =
4938 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4939 auto NewActiveBits =
4940 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4941 .addReg(FF1->getOperand(0).getReg())
4942 .addReg(ActiveBits->getOperand(0).getReg());
4943
4944 // Add the loop back-edge operands to the phi nodes created above.
4945 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4946 .addMBB(ComputeLoop);
4947 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4948 .addMBB(ComputeLoop);
4949
4950 // Create the back-edge branch: loop again while any active-lane bit
4951 // remains set.
4951 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4952 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4953 .addReg(NewActiveBits->getOperand(0).getReg())
4954 .addImm(0);
4955 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4956 .addMBB(ComputeLoop);
4957
4958 RetBB = ComputeEnd;
4959 }
4960 MI.eraseFromParent();
4961 return RetBB;
4962}
4963
// TargetLowering hook: expand pseudo-instructions that were tagged
// usesCustomInserter. Each case rewrites MI into real machine instructions
// (usually erasing MI) and returns the basic block where insertion should
// continue.
// NOTE(review): this listing elides hyperlinked lines (source numbers jump,
// e.g. 4964, 4967, 4969, 5001, 5048, 5131/5134, 5373, 5429, 5459, 5471,
// 5520, 5533, 5542, 5545 are missing); the function header and several local
// declarations (TII, MRI, MFI, MII, MIB, Info, TrapBB, ...) live on those
// elided lines — confirm against the real source before editing.
4965 MachineInstr &MI, MachineBasicBlock *BB) const {
4966
4968 MachineFunction *MF = BB->getParent();
4970
4971 switch (MI.getOpcode()) {
4972 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4973 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4974 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4975 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4976 case AMDGPU::S_UADDO_PSEUDO:
4977 case AMDGPU::S_USUBO_PSEUDO: {
// Scalar add/sub with overflow: the s_add/s_sub sets SCC, which the
// following s_cselect materializes as a 1/0 overflow flag in Dest1.
4978 const DebugLoc &DL = MI.getDebugLoc();
4979 MachineOperand &Dest0 = MI.getOperand(0);
4980 MachineOperand &Dest1 = MI.getOperand(1);
4981 MachineOperand &Src0 = MI.getOperand(2);
4982 MachineOperand &Src1 = MI.getOperand(3);
4983
4984 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4985 ? AMDGPU::S_ADD_I32
4986 : AMDGPU::S_SUB_I32;
4987 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4988
4989 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4990 .addImm(1)
4991 .addImm(0);
4992
4993 MI.eraseFromParent();
4994 return BB;
4995 }
4996 case AMDGPU::S_ADD_U64_PSEUDO:
4997 case AMDGPU::S_SUB_U64_PSEUDO: {
4998 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4999 // For GFX12, we emit s_add_u64 and s_sub_u64.
5000 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5002 const DebugLoc &DL = MI.getDebugLoc();
5003 MachineOperand &Dest = MI.getOperand(0);
5004 MachineOperand &Src0 = MI.getOperand(1);
5005 MachineOperand &Src1 = MI.getOperand(2);
5006 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5007 if (Subtarget->hasScalarAddSub64()) {
5008 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5009 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5010 .add(Src0)
5011 .add(Src1);
5012 } else {
// 64-bit split: low half produces a carry in SCC that the high half
// (s_addc/s_subb) consumes; results are glued back with REG_SEQUENCE.
5013 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5014 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5015
5016 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5017 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5018
5019 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5020 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5021 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5022 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5023
5024 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5025 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5026 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5027 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5028
5029 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5030 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5031 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5032 .add(Src0Sub0)
5033 .add(Src1Sub0);
5034 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5035 .add(Src0Sub1)
5036 .add(Src1Sub1);
5037 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5038 .addReg(DestSub0)
5039 .addImm(AMDGPU::sub0)
5040 .addReg(DestSub1)
5041 .addImm(AMDGPU::sub1);
5042 }
5043 MI.eraseFromParent();
5044 return BB;
5045 }
5046 case AMDGPU::V_ADD_U64_PSEUDO:
5047 case AMDGPU::V_SUB_U64_PSEUDO: {
5049 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5050 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5051 const DebugLoc &DL = MI.getDebugLoc();
5052
5053 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5054
5055 MachineOperand &Dest = MI.getOperand(0);
5056 MachineOperand &Src0 = MI.getOperand(1);
5057 MachineOperand &Src1 = MI.getOperand(2);
5058
// Prefer the single 64-bit v_lshl_add_u64 (shift of 0 => plain add)
// when the subtarget has it; only the add form exists.
5059 if (IsAdd && ST.hasLshlAddB64()) {
5060 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5061 Dest.getReg())
5062 .add(Src0)
5063 .addImm(0)
5064 .add(Src1);
5065 TII->legalizeOperands(*Add);
5066 MI.eraseFromParent();
5067 return BB;
5068 }
5069
// Otherwise split into 32-bit halves linked through an explicit VCC-class
// carry register (CarryReg); the high half's carry-out is dead.
5070 const auto *CarryRC = TRI->getWaveMaskRegClass();
5071
5072 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5073 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5074
5075 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5076 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5077
5078 const TargetRegisterClass *Src0RC = Src0.isReg()
5079 ? MRI.getRegClass(Src0.getReg())
5080 : &AMDGPU::VReg_64RegClass;
5081 const TargetRegisterClass *Src1RC = Src1.isReg()
5082 ? MRI.getRegClass(Src1.getReg())
5083 : &AMDGPU::VReg_64RegClass;
5084
5085 const TargetRegisterClass *Src0SubRC =
5086 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5087 const TargetRegisterClass *Src1SubRC =
5088 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5089
5090 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5091 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5092 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5093 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5094
5095 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5096 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5097 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5098 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5099
5100 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5101 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5102 .addReg(CarryReg, RegState::Define)
5103 .add(SrcReg0Sub0)
5104 .add(SrcReg1Sub0)
5105 .addImm(0); // clamp bit
5106
5107 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5108 MachineInstr *HiHalf =
5109 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5110 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5111 .add(SrcReg0Sub1)
5112 .add(SrcReg1Sub1)
5113 .addReg(CarryReg, RegState::Kill)
5114 .addImm(0); // clamp bit
5115
5116 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5117 .addReg(DestSub0)
5118 .addImm(AMDGPU::sub0)
5119 .addReg(DestSub1)
5120 .addImm(AMDGPU::sub1);
5121 TII->legalizeOperands(*LoHalf);
5122 TII->legalizeOperands(*HiHalf);
5123 MI.eraseFromParent();
5124 return BB;
5125 }
5126 case AMDGPU::S_ADD_CO_PSEUDO:
5127 case AMDGPU::S_SUB_CO_PSEUDO: {
5128 // This pseudo has a chance to be selected
5129 // only from uniform add/subcarry node. All the VGPR operands
5130 // therefore assumed to be splat vectors.
5132 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5133 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5135 const DebugLoc &DL = MI.getDebugLoc();
5136 MachineOperand &Dest = MI.getOperand(0);
5137 MachineOperand &CarryDest = MI.getOperand(1);
5138 MachineOperand &Src0 = MI.getOperand(2);
5139 MachineOperand &Src1 = MI.getOperand(3);
5140 MachineOperand &Src2 = MI.getOperand(4);
5141 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5142 ? AMDGPU::S_ADDC_U32
5143 : AMDGPU::S_SUBB_U32;
// Uniform values may still live in VGPRs; read lane 0 to move each such
// operand into an SGPR (valid because operands are assumed splat).
5144 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5145 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5146 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5147 .addReg(Src0.getReg());
5148 Src0.setReg(RegOp0);
5149 }
5150 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5151 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5152 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5153 .addReg(Src1.getReg());
5154 Src1.setReg(RegOp1);
5155 }
5156 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5157 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5158 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5159 .addReg(Src2.getReg());
5160 Src2.setReg(RegOp2);
5161 }
5162
5163 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5164 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5165 assert(WaveSize == 64 || WaveSize == 32);
5166
// Materialize the incoming carry (Src2) into SCC: compare it against 0.
// Without 64-bit scalar compare support, OR the halves first.
5167 if (WaveSize == 64) {
5168 if (ST.hasScalarCompareEq64()) {
5169 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5170 .addReg(Src2.getReg())
5171 .addImm(0);
5172 } else {
5173 const TargetRegisterClass *SubRC =
5174 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5175 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5176 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5177 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5178 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5179 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5180
5181 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5182 .add(Src2Sub0)
5183 .add(Src2Sub1);
5184
5185 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5186 .addReg(Src2_32, RegState::Kill)
5187 .addImm(0);
5188 }
5189 } else {
5190 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5191 .addReg(Src2.getReg())
5192 .addImm(0);
5193 }
5194
// s_addc/s_subb consumes SCC as carry-in and defines SCC as carry-out...
5195 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5196
5197 unsigned SelOpc =
5198 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5199
// ...which this cselect materializes as an all-ones/zero carry value.
5200 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5201 .addImm(-1)
5202 .addImm(0);
5203
5204 MI.eraseFromParent();
5205 return BB;
5206 }
5207 case AMDGPU::SI_INIT_M0: {
// Copy the operand into the dedicated M0 physical register.
5208 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5209 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5210 .add(MI.getOperand(0));
5211 MI.eraseFromParent();
5212 return BB;
5213 }
5214 case AMDGPU::GET_GROUPSTATICSIZE: {
// Replace with an immediate move of the statically known LDS size.
5215 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5216 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5217 DebugLoc DL = MI.getDebugLoc();
5218 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5219 .add(MI.getOperand(0))
5220 .addImm(MFI->getLDSSize());
5221 MI.eraseFromParent();
5222 return BB;
5223 }
5224 case AMDGPU::GET_SHADERCYCLESHILO: {
5227 const DebugLoc &DL = MI.getDebugLoc();
5228 // The algorithm is:
5229 //
5230 // hi1 = getreg(SHADER_CYCLES_HI)
5231 // lo1 = getreg(SHADER_CYCLES_LO)
5232 // hi2 = getreg(SHADER_CYCLES_HI)
5233 //
5234 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5235 // Otherwise there was overflow and the result is hi2:0. In both cases the
5236 // result should represent the actual time at some point during the sequence
5237 // of three getregs.
5238 using namespace AMDGPU::Hwreg;
5239 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5241 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5242 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5243 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5244 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5245 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5246 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5247 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5248 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5249 .addReg(RegHi1)
5250 .addReg(RegHi2);
5251 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5252 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5253 .addReg(RegLo1)
5254 .addImm(0);
5255 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5256 .add(MI.getOperand(0))
5257 .addReg(RegLo)
5258 .addImm(AMDGPU::sub0)
5259 .addReg(RegHi2)
5260 .addImm(AMDGPU::sub1);
5261 MI.eraseFromParent();
5262 return BB;
5263 }
5264 case AMDGPU::SI_INDIRECT_SRC_V1:
5265 case AMDGPU::SI_INDIRECT_SRC_V2:
5266 case AMDGPU::SI_INDIRECT_SRC_V4:
5267 case AMDGPU::SI_INDIRECT_SRC_V8:
5268 case AMDGPU::SI_INDIRECT_SRC_V9:
5269 case AMDGPU::SI_INDIRECT_SRC_V10:
5270 case AMDGPU::SI_INDIRECT_SRC_V11:
5271 case AMDGPU::SI_INDIRECT_SRC_V12:
5272 case AMDGPU::SI_INDIRECT_SRC_V16:
5273 case AMDGPU::SI_INDIRECT_SRC_V32:
5274 return emitIndirectSrc(MI, *BB, *getSubtarget());
5275 case AMDGPU::SI_INDIRECT_DST_V1:
5276 case AMDGPU::SI_INDIRECT_DST_V2:
5277 case AMDGPU::SI_INDIRECT_DST_V4:
5278 case AMDGPU::SI_INDIRECT_DST_V8:
5279 case AMDGPU::SI_INDIRECT_DST_V9:
5280 case AMDGPU::SI_INDIRECT_DST_V10:
5281 case AMDGPU::SI_INDIRECT_DST_V11:
5282 case AMDGPU::SI_INDIRECT_DST_V12:
5283 case AMDGPU::SI_INDIRECT_DST_V16:
5284 case AMDGPU::SI_INDIRECT_DST_V32:
5285 return emitIndirectDst(MI, *BB, *getSubtarget());
5286 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5287 case AMDGPU::SI_KILL_I1_PSEUDO:
5288 return splitKillBlock(MI, BB);
5289 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
// Expand a 64-bit select into two 32-bit v_cndmask halves sharing one
// condition copy, recombined with REG_SEQUENCE.
5291 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5292 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5293
5294 Register Dst = MI.getOperand(0).getReg();
5295 const MachineOperand &Src0 = MI.getOperand(1);
5296 const MachineOperand &Src1 = MI.getOperand(2);
5297 const DebugLoc &DL = MI.getDebugLoc();
5298 Register SrcCond = MI.getOperand(3).getReg();
5299
5300 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5301 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5302 const auto *CondRC = TRI->getWaveMaskRegClass();
5303 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5304
5305 const TargetRegisterClass *Src0RC = Src0.isReg()
5306 ? MRI.getRegClass(Src0.getReg())
5307 : &AMDGPU::VReg_64RegClass;
5308 const TargetRegisterClass *Src1RC = Src1.isReg()
5309 ? MRI.getRegClass(Src1.getReg())
5310 : &AMDGPU::VReg_64RegClass;
5311
5312 const TargetRegisterClass *Src0SubRC =
5313 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5314 const TargetRegisterClass *Src1SubRC =
5315 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5316
5317 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5318 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5319 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5320 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5321
5322 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5323 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5324 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5325 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5326
5327 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5328 .addReg(SrcCond);
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5330 .addImm(0)
5331 .add(Src0Sub0)
5332 .addImm(0)
5333 .add(Src1Sub0)
5334 .addReg(SrcCondCopy);
5335 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5336 .addImm(0)
5337 .add(Src0Sub1)
5338 .addImm(0)
5339 .add(Src1Sub1)
5340 .addReg(SrcCondCopy);
5341
5342 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5343 .addReg(DstLo)
5344 .addImm(AMDGPU::sub0)
5345 .addReg(DstHi)
5346 .addImm(AMDGPU::sub1);
5347 MI.eraseFromParent();
5348 return BB;
5349 }
5350 case AMDGPU::SI_BR_UNDEF: {
5352 const DebugLoc &DL = MI.getDebugLoc();
5353 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5354 .add(MI.getOperand(0));
5355 Br->getOperand(1).setIsUndef(); // read undef SCC
5356 MI.eraseFromParent();
5357 return BB;
5358 }
5359 case AMDGPU::ADJCALLSTACKUP:
5360 case AMDGPU::ADJCALLSTACKDOWN: {
// Annotate the stack-adjust pseudo with implicit def+use of the stack
// pointer so later passes see the dependency; MI itself is kept.
5362 MachineInstrBuilder MIB(*MF, &MI);
5363 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5364 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5365 return BB;
5366 }
5367 case AMDGPU::SI_CALL_ISEL: {
// Rewrite SI_CALL_ISEL into SI_CALL defining the return-address register,
// cloning all operands and memory references.
5369 const DebugLoc &DL = MI.getDebugLoc();
5370
5371 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5372
5374 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5375
5376 for (const MachineOperand &MO : MI.operands())
5377 MIB.add(MO);
5378
5379 MIB.cloneMemRefs(MI);
5380 MI.eraseFromParent();
5381 return BB;
5382 }
5383 case AMDGPU::V_ADD_CO_U32_e32:
5384 case AMDGPU::V_SUB_CO_U32_e32:
5385 case AMDGPU::V_SUBREV_CO_U32_e32: {
5386 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5387 const DebugLoc &DL = MI.getDebugLoc();
5388 unsigned Opc = MI.getOpcode();
5389
// If the e32 encoding doesn't exist on this subtarget, fall back to the
// VOP3 (e64) form, which needs explicit VCC and clamp operands.
5390 bool NeedClampOperand = false;
5391 if (TII->pseudoToMCOpcode(Opc) == -1) {
5392 Opc = AMDGPU::getVOPe64(Opc);
5393 NeedClampOperand = true;
5394 }
5395
5396 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5397 if (TII->isVOP3(*I)) {
5398 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5399 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5400 I.addReg(TRI->getVCC(), RegState::Define);
5401 }
5402 I.add(MI.getOperand(1))
5403 .add(MI.getOperand(2));
5404 if (NeedClampOperand)
5405 I.addImm(0); // clamp bit for e64 encoding
5406
5407 TII->legalizeOperands(*I);
5408
5409 MI.eraseFromParent();
5410 return BB;
5411 }
5412 case AMDGPU::V_ADDC_U32_e32:
5413 case AMDGPU::V_SUBB_U32_e32:
5414 case AMDGPU::V_SUBBREV_U32_e32:
5415 // These instructions have an implicit use of vcc which counts towards the
5416 // constant bus limit.
5417 TII->legalizeOperands(MI);
5418 return BB;
5419 case AMDGPU::DS_GWS_INIT:
5420 case AMDGPU::DS_GWS_SEMA_BR:
5421 case AMDGPU::DS_GWS_BARRIER:
5422 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5423 [[fallthrough]];
5424 case AMDGPU::DS_GWS_SEMA_V:
5425 case AMDGPU::DS_GWS_SEMA_P:
5426 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5427 // A s_waitcnt 0 is required to be the instruction immediately following.
// NOTE(review): source line 5429 is elided in this listing — presumably
// the statement that bundles the waitcnt when auto-replay is available;
// confirm against the real source.
5428 if (getSubtarget()->hasGWSAutoReplay()) {
5430 return BB;
5431 }
5432
5433 return emitGWSMemViolTestLoop(MI, BB);
5434 case AMDGPU::S_SETREG_B32: {
5435 // Try to optimize cases that only set the denormal mode or rounding mode.
5436 //
5437 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5438 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5439 // instead.
5440 //
5441 // FIXME: This could be predicates on the immediate, but tablegen doesn't
5442 // allow you to have a no side effect instruction in the output of a
5443 // sideeffecting pattern.
5444 auto [ID, Offset, Width] =
5445 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5446 if (ID != AMDGPU::Hwreg::ID_MODE)
5447 return BB;
5448
5449 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5450 const unsigned SetMask = WidthMask << Offset;
5451
5452 if (getSubtarget()->hasDenormModeInst()) {
5453 unsigned SetDenormOp = 0;
5454 unsigned SetRoundOp = 0;
5455
5456 // The dedicated instructions can only set the whole denorm or round mode
5457 // at once, not a subset of bits in either.
// NOTE(review): the right-hand side of this comparison (source line 5459)
// is elided in this listing — presumably the combined
// FP_ROUND_MASK | FP_DENORM_MASK; confirm against the real source.
5458 if (SetMask ==
5460 // If this fully sets both the round and denorm mode, emit the two
5461 // dedicated instructions for these.
5462 SetRoundOp = AMDGPU::S_ROUND_MODE;
5463 SetDenormOp = AMDGPU::S_DENORM_MODE;
5464 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5465 SetRoundOp = AMDGPU::S_ROUND_MODE;
5466 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5467 SetDenormOp = AMDGPU::S_DENORM_MODE;
5468 }
5469
5470 if (SetRoundOp || SetDenormOp) {
// Only fold when the written value is a known immediate: low 4 bits are
// the round mode, the next 4 the denorm mode.
5472 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5473 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5474 unsigned ImmVal = Def->getOperand(1).getImm();
5475 if (SetRoundOp) {
5476 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5477 .addImm(ImmVal & 0xf);
5478
5479 // If we also have the denorm mode, get just the denorm mode bits.
5480 ImmVal >>= 4;
5481 }
5482
5483 if (SetDenormOp) {
5484 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5485 .addImm(ImmVal & 0xf);
5486 }
5487
5488 MI.eraseFromParent();
5489 return BB;
5490 }
5491 }
5492 }
5493
5494 // If only FP bits are touched, use the no side effects pseudo.
5495 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5496 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5497 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5498
5499 return BB;
5500 }
5501 case AMDGPU::S_INVERSE_BALLOT_U32:
5502 case AMDGPU::S_INVERSE_BALLOT_U64:
5503 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
5504 // necessary. After that they are equivalent to a COPY.
5505 MI.setDesc(TII->get(AMDGPU::COPY));
5506 return BB;
5507 case AMDGPU::ENDPGM_TRAP: {
5508 const DebugLoc &DL = MI.getDebugLoc();
// Fast path: already the last instruction of a block with no successors,
// so it can simply become s_endpgm in place.
5509 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5510 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5511 MI.addOperand(MachineOperand::CreateImm(0));
5512 return BB;
5513 }
5514
5515 // We need a block split to make the real endpgm a terminator. We also don't
5516 // want to break phis in successor blocks, so we can't just delete to the
5517 // end of the block.
5518
// NOTE(review): source line 5520 (the creation of TrapBB) is elided in
// this listing — confirm against the real source.
5519 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5521 MF->push_back(TrapBB);
5522 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5523 .addImm(0);
5524 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5525 .addMBB(TrapBB);
5526
5527 BB->addSuccessor(TrapBB);
5528 MI.eraseFromParent();
5529 return SplitBB;
5530 }
5531 case AMDGPU::SIMULATED_TRAP: {
5532 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5534 MachineBasicBlock *SplitBB =
5535 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5536 MI.eraseFromParent();
5537 return SplitBB;
5538 }
5539 default:
// NOTE(review): source lines 5542 and 5545 are elided in this listing —
// presumably the non-store image/MUBUF handling and the fallback to the
// AMDGPUTargetLowering implementation; confirm against the real source.
5540 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5541 if (!MI.mayStore())
5543 return BB;
5544 }
5546 }
5547}
5548
// Hook: returns true unconditionally, opting this target into aggressive
// FMA fusion in DAG combines. (The function header naming this hook is on a
// line elided from this listing.)
5550 // This currently forces unfolding various combinations of fsub into fma with
5551 // free fneg'd operands. As long as we have fast FMA (controlled by
5552 // isFMAFasterThanFMulAndFAdd), we should perform these.
5553
5554 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5555 // most of these combines appear to be cycle neutral but save on instruction
5556 // count / code size.
5557 return true;
5558}
5559
// setcc results are i1 per scalar: a plain i1 for scalars, and a vector of
// i1 with matching element count for vector compares. (Header line naming
// this hook is elided from this listing.)
5561
5563 EVT VT) const {
5564 if (!VT.isVector()) {
5565 return MVT::i1;
5566 }
5567 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5568}
5569
// Shift-amount type: i16 only when the shifted value is itself i16,
// otherwise i32. (Header line naming this hook is elided from this listing.)
5571 // TODO: Should i16 be used always if legal? For now it would force VALU
5572 // shifts.
5573 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5574}
5575
// GlobalISel shift-amount preference: 16-bit amounts for <=16-bit scalars on
// subtargets with 16-bit instructions, else 32-bit. (Header line naming this
// hook is elided from this listing.)
5577 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5578 ? Ty.changeElementSize(16)
5579 : Ty.changeElementSize(32);
5580}
5581
5582// Answering this is somewhat tricky and depends on the specific device which
5583// have different rates for fma or all f64 operations.
5584//
5585// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5586// regardless of which device (although the number of cycles differs between
5587// devices), so it is always profitable for f64.
5588//
5589// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5590// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5591// which we can always do even without fused FP ops since it returns the same
5592// result as the separate operations and since it is always full
5593// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5594// however does not support denormals, so we do report fma as faster if we have
5595// a fast fma device and require denormals.
5596//
5598 EVT VT) const {
5599 VT = VT.getScalarType();
5600
5601 switch (VT.getSimpleVT().SimpleTy) {
5602 case MVT::f32: {
5603 // If mad is not available this depends only on if f32 fma is full rate.
5604 if (!Subtarget->hasMadMacF32Insts())
5605 return Subtarget->hasFastFMAF32();
5606
5607 // Otherwise f32 mad is always full rate and returns the same result as
5608 // the separate operations so should be preferred over fma.
5609 // However does not support denormals.
// NOTE(review): a guard line (source line 5610, presumably an `if` on the
// f32 denormal mode) is elided from this listing between the comment above
// and the return below — without it the second return looks unreachable.
// Confirm against the real source.
5611 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5612
5613 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5614 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5615 }
5616 case MVT::f64:
5617 return true;
5618 case MVT::f16:
5619 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5620 default:
5621 break;
5622 }
5623
5624 return false;
5625}
5626
// GlobalISel (LLT) overload: dispatch on scalar bit width to the EVT-based
// overload above. (Header line naming this hook is elided from this
// listing.)
5628 LLT Ty) const {
5629 switch (Ty.getScalarSizeInBits()) {
5630 case 16:
5631 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5632 case 32:
5633 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5634 case 64:
5635 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5636 default:
5637 break;
5638 }
5639
5640 return false;
5641}
5642
// GlobalISel FMAD legality: only scalar f16/f32, and only when the
// function's denormal mode flushes (mad/mac don't handle denormals).
// (Header line naming this hook is elided from this listing.)
5644 if (!Ty.isScalar())
5645 return false;
5646
5647 if (Ty.getScalarSizeInBits() == 16)
5648 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5649 if (Ty.getScalarSizeInBits() == 32)
5650 return Subtarget->hasMadMacF32Insts() &&
5651 denormalModeIsFlushAllF32(*MI.getMF());
5652
5653 return false;
5654}
5655
// SelectionDAG FMAD legality for node N's result type.
// NOTE(review): the continuation lines of both returns (source lines 5663
// and 5666, presumably the denormal-mode checks mirroring the LLT overload
// above) are elided from this listing — confirm against the real source.
5657 const SDNode *N) const {
5658 // TODO: Check future ftz flag
5659 // v_mad_f32/v_mac_f32 do not support denormals.
5660 EVT VT = N->getValueType(0);
5661 if (VT == MVT::f32)
5662 return Subtarget->hasMadMacF32Insts() &&
5664 if (VT == MVT::f16) {
5665 return Subtarget->hasMadF16() &&
5667 }
5668
5669 return false;
5670}
5671
5672//===----------------------------------------------------------------------===//
5673// Custom DAG Lowering Operations
5674//===----------------------------------------------------------------------===//
5675
5676 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5677 // wider vector type is legal.
// Split a unary vector op into lo/hi halves and re-concatenate, preserving
// the node flags on both halves. (Header line naming this method is elided
// from this listing.)
5679 SelectionDAG &DAG) const {
5680 unsigned Opc = Op.getOpcode();
5681 EVT VT = Op.getValueType();
// Only the wide vector types this lowering splits are expected here.
5682 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5683 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5684 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5685 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5686
5687 SDValue Lo, Hi;
5688 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5689
5690 SDLoc SL(Op);
5691 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5692 Op->getFlags());
5693 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5694 Op->getFlags());
5695
5696 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5697}
5698
5699 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5700 // wider vector type is legal.
// Split a binary vector op: both operands are split at operand index 0 and
// 1, the op is applied per half, and the halves are re-concatenated.
// (Header line naming this method is elided from this listing.)
5702 SelectionDAG &DAG) const {
5703 unsigned Opc = Op.getOpcode();
5704 EVT VT = Op.getValueType();
5705 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5706 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5707 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5708 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5709
5710 SDValue Lo0, Hi0;
5711 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5712 SDValue Lo1, Hi1;
5713 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5714
5715 SDLoc SL(Op);
5716
5717 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5718 Op->getFlags());
5719 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5720 Op->getFlags());
5721
5722 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5723}
5724
// Split a ternary vector op (e.g. fma-like nodes) into lo/hi halves.
// Operand 0 may be a scalar (e.g. a select condition); in that case it is
// broadcast to both halves instead of being split. (Header line naming this
// method is elided from this listing.)
5726 SelectionDAG &DAG) const {
5727 unsigned Opc = Op.getOpcode();
5728 EVT VT = Op.getValueType();
5729 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5730 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5731 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5732 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5733 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5734 VT == MVT::v32bf16);
5735
5736 SDValue Lo0, Hi0;
5737 SDValue Op0 = Op.getOperand(0);
5738 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5739 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5740 : std::pair(Op0, Op0);
5741 SDValue Lo1, Hi1;
5742 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5743 SDValue Lo2, Hi2;
5744 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5745
5746 SDLoc SL(Op);
5747 auto ResVT = DAG.GetSplitDestVTs(VT);
5748
5749 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5750 Op->getFlags());
5751 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5752 Op->getFlags());
5753
5754 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5755}
5756
5757
5759 switch (Op.getOpcode()) {
5760 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5761 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5762 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5763 case ISD::LOAD: {
5764 SDValue Result = LowerLOAD(Op, DAG);
5765 assert((!Result.getNode() ||
5766 Result.getNode()->getNumValues() == 2) &&
5767 "Load should return a value and a chain");
5768 return Result;
5769 }
5770 case ISD::FSQRT: {
5771 EVT VT = Op.getValueType();
5772 if (VT == MVT::f32)
5773 return lowerFSQRTF32(Op, DAG);
5774 if (VT == MVT::f64)
5775 return lowerFSQRTF64(Op, DAG);
5776 return SDValue();
5777 }
5778 case ISD::FSIN:
5779 case ISD::FCOS:
5780 return LowerTrig(Op, DAG);
5781 case ISD::SELECT: return LowerSELECT(Op, DAG);
5782 case ISD::FDIV: return LowerFDIV(Op, DAG);
5783 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5784 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5785 case ISD::STORE: return LowerSTORE(Op, DAG);
5786 case ISD::GlobalAddress: {
5789 return LowerGlobalAddress(MFI, Op, DAG);
5790 }
5791 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5792 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5793 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5794 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5796 return lowerINSERT_SUBVECTOR(Op, DAG);
5798 return lowerINSERT_VECTOR_ELT(Op, DAG);
5800 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5802 return lowerVECTOR_SHUFFLE(Op, DAG);
5804 return lowerSCALAR_TO_VECTOR(Op, DAG);
5805 case ISD::BUILD_VECTOR:
5806 return lowerBUILD_VECTOR(Op, DAG);
5807 case ISD::FP_ROUND:
5809 return lowerFP_ROUND(Op, DAG);
5810 case ISD::TRAP:
5811 return lowerTRAP(Op, DAG);
5812 case ISD::DEBUGTRAP:
5813 return lowerDEBUGTRAP(Op, DAG);
5814 case ISD::ABS:
5815 case ISD::FABS:
5816 case ISD::FNEG:
5817 case ISD::FCANONICALIZE:
5818 case ISD::BSWAP:
5819 return splitUnaryVectorOp(Op, DAG);
5820 case ISD::FMINNUM:
5821 case ISD::FMAXNUM:
5822 return lowerFMINNUM_FMAXNUM(Op, DAG);
5823 case ISD::FLDEXP:
5824 case ISD::STRICT_FLDEXP:
5825 return lowerFLDEXP(Op, DAG);
5826 case ISD::FMA:
5827 return splitTernaryVectorOp(Op, DAG);
5828 case ISD::FP_TO_SINT:
5829 case ISD::FP_TO_UINT:
5830 return LowerFP_TO_INT(Op, DAG);
5831 case ISD::SHL:
5832 case ISD::SRA:
5833 case ISD::SRL:
5834 case ISD::ADD:
5835 case ISD::SUB:
5836 case ISD::SMIN:
5837 case ISD::SMAX:
5838 case ISD::UMIN:
5839 case ISD::UMAX:
5840 case ISD::FADD:
5841 case ISD::FMUL:
5842 case ISD::FMINNUM_IEEE:
5843 case ISD::FMAXNUM_IEEE:
5844 case ISD::FMINIMUM:
5845 case ISD::FMAXIMUM:
5846 case ISD::FMINIMUMNUM:
5847 case ISD::FMAXIMUMNUM:
5848 case ISD::UADDSAT:
5849 case ISD::USUBSAT:
5850 case ISD::SADDSAT:
5851 case ISD::SSUBSAT:
5852 return splitBinaryVectorOp(Op, DAG);
5853 case ISD::MUL:
5854 return lowerMUL(Op, DAG);
5855 case ISD::SMULO:
5856 case ISD::UMULO:
5857 return lowerXMULO(Op, DAG);
5858 case ISD::SMUL_LOHI:
5859 case ISD::UMUL_LOHI:
5860 return lowerXMUL_LOHI(Op, DAG);
5861 case ISD::DYNAMIC_STACKALLOC:
5862 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5863 case ISD::STACKSAVE:
5864 return LowerSTACKSAVE(Op, DAG);
5865 case ISD::GET_ROUNDING:
5866 return lowerGET_ROUNDING(Op, DAG);
5867 case ISD::SET_ROUNDING:
5868 return lowerSET_ROUNDING(Op, DAG);
5869 case ISD::PREFETCH:
5870 return lowerPREFETCH(Op, DAG);
5871 case ISD::FP_EXTEND:
5873 return lowerFP_EXTEND(Op, DAG);
5874 case ISD::GET_FPENV:
5875 return lowerGET_FPENV(Op, DAG);
5876 case ISD::SET_FPENV:
5877 return lowerSET_FPENV(Op, DAG);
5878 }
5879 return SDValue();
5880}
5881
5882// Used for D16: Casts the result of an instruction into the right vector,
5883// packs values if loads return unpacked values.
5885 const SDLoc &DL,
5886 SelectionDAG &DAG, bool Unpacked) {
5887 if (!LoadVT.isVector())
5888 return Result;
5889
5890 // Cast back to the original packed type or to a larger type that is a
5891 // multiple of 32 bit for D16. Widening the return type is a required for
5892 // legalization.
5893 EVT FittingLoadVT = LoadVT;
5894 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5895 FittingLoadVT =
5897 LoadVT.getVectorNumElements() + 1);
5898 }
5899
5900 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5901 // Truncate to v2i16/v4i16.
5902 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5903
5904 // Workaround legalizer not scalarizing truncate after vector op
5905 // legalization but not creating intermediate vector trunc.
5907 DAG.ExtractVectorElements(Result, Elts);
5908 for (SDValue &Elt : Elts)
5909 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5910
5911 // Pad illegal v1i16/v3fi6 to v4i16
5912 if ((LoadVT.getVectorNumElements() % 2) == 1)
5913 Elts.push_back(DAG.getUNDEF(MVT::i16));
5914
5915 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5916
5917 // Bitcast to original type (v2f16/v4f16).
5918 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5919 }
5920
5921 // Cast back to the original packed type.
5922 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5923}
5924
5925SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5926 MemSDNode *M,
5927 SelectionDAG &DAG,
5929 bool IsIntrinsic) const {
5930 SDLoc DL(M);
5931
5932 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5933 EVT LoadVT = M->getValueType(0);
5934
5935 EVT EquivLoadVT = LoadVT;
5936 if (LoadVT.isVector()) {
5937 if (Unpacked) {
5938 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5939 LoadVT.getVectorNumElements());
5940 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5941 // Widen v3f16 to legal type
5942 EquivLoadVT =
5944 LoadVT.getVectorNumElements() + 1);
5945 }
5946 }
5947
5948 // Change from v4f16/v2f16 to EquivLoadVT.
5949 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5950
5952 = DAG.getMemIntrinsicNode(
5953 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5954 VTList, Ops, M->getMemoryVT(),
5955 M->getMemOperand());
5956
5957 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5958
5959 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5960}
5961
5962SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5963 SelectionDAG &DAG,
5964 ArrayRef<SDValue> Ops) const {
5965 SDLoc DL(M);
5966 EVT LoadVT = M->getValueType(0);
5967 EVT EltType = LoadVT.getScalarType();
5968 EVT IntVT = LoadVT.changeTypeToInteger();
5969
5970 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5971
5972 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5973 bool IsTFE = M->getNumValues() == 3;
5974
5975 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5977 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
5978 : AMDGPUISD::BUFFER_LOAD;
5979
5980 if (IsD16) {
5981 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5982 }
5983
5984 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5985 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5986 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
5987 IsTFE);
5988
5989 if (isTypeLegal(LoadVT)) {
5990 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5991 M->getMemOperand(), DAG);
5992 }
5993
5994 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5995 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5996 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5997 M->getMemOperand(), DAG);
5998 return DAG.getMergeValues(
5999 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
6000 DL);
6001}
6002
6004 SDNode *N, SelectionDAG &DAG) {
6005 EVT VT = N->getValueType(0);
6006 unsigned CondCode = N->getConstantOperandVal(3);
6007 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
6008 return DAG.getUNDEF(VT);
6009
6010 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
6011
6012 SDValue LHS = N->getOperand(1);
6013 SDValue RHS = N->getOperand(2);
6014
6015 SDLoc DL(N);
6016
6017 EVT CmpVT = LHS.getValueType();
6018 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6019 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6021 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6022 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6023 }
6024
6025 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6026
6027 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6028 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6029
6030 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6031 DAG.getCondCode(CCOpcode));
6032 if (VT.bitsEq(CCVT))
6033 return SetCC;
6034 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6035}
6036
6038 SDNode *N, SelectionDAG &DAG) {
6039 EVT VT = N->getValueType(0);
6040
6041 unsigned CondCode = N->getConstantOperandVal(3);
6042 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6043 return DAG.getUNDEF(VT);
6044
6045 SDValue Src0 = N->getOperand(1);
6046 SDValue Src1 = N->getOperand(2);
6047 EVT CmpVT = Src0.getValueType();
6048 SDLoc SL(N);
6049
6050 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6051 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6052 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6053 }
6054
6055 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6056 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6057 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6058 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6059 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6060 Src1, DAG.getCondCode(CCOpcode));
6061 if (VT.bitsEq(CCVT))
6062 return SetCC;
6063 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6064}
6065
6067 SelectionDAG &DAG) {
6068 EVT VT = N->getValueType(0);
6069 SDValue Src = N->getOperand(1);
6070 SDLoc SL(N);
6071
6072 if (Src.getOpcode() == ISD::SETCC) {
6073 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6074 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6075 Src.getOperand(1), Src.getOperand(2));
6076 }
6077 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6078 // (ballot 0) -> 0
6079 if (Arg->isZero())
6080 return DAG.getConstant(0, SL, VT);
6081
6082 // (ballot 1) -> EXEC/EXEC_LO
6083 if (Arg->isOne()) {
6084 Register Exec;
6085 if (VT.getScalarSizeInBits() == 32)
6086 Exec = AMDGPU::EXEC_LO;
6087 else if (VT.getScalarSizeInBits() == 64)
6088 Exec = AMDGPU::EXEC;
6089 else
6090 return SDValue();
6091
6092 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6093 }
6094 }
6095
6096 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6097 // ISD::SETNE)
6098 return DAG.getNode(
6099 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6100 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6101}
6102
6104 SelectionDAG &DAG) {
6105 EVT VT = N->getValueType(0);
6106 unsigned ValSize = VT.getSizeInBits();
6107 unsigned IID = N->getConstantOperandVal(0);
6108 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6109 IID == Intrinsic::amdgcn_permlanex16;
6110 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6111 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6112 SDLoc SL(N);
6113 MVT IntVT = MVT::getIntegerVT(ValSize);
6114
6115 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
6116 SDValue Src2, MVT ValT) -> SDValue {
6118 switch (IID) {
6119 case Intrinsic::amdgcn_permlane16:
6120 case Intrinsic::amdgcn_permlanex16:
6121 Operands.push_back(N->getOperand(6));
6122 Operands.push_back(N->getOperand(5));
6123 Operands.push_back(N->getOperand(4));
6124 [[fallthrough]];
6125 case Intrinsic::amdgcn_writelane:
6126 Operands.push_back(Src2);
6127 [[fallthrough]];
6128 case Intrinsic::amdgcn_readlane:
6129 case Intrinsic::amdgcn_set_inactive:
6130 case Intrinsic::amdgcn_set_inactive_chain_arg:
6131 Operands.push_back(Src1);
6132 [[fallthrough]];
6133 case Intrinsic::amdgcn_readfirstlane:
6134 case Intrinsic::amdgcn_permlane64:
6135 Operands.push_back(Src0);
6136 break;
6137 default:
6138 llvm_unreachable("unhandled lane op");
6139 }
6140
6141 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
6142 std::reverse(Operands.begin(), Operands.end());
6143
6144 if (SDNode *GL = N->getGluedNode()) {
6145 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
6146 GL = GL->getOperand(0).getNode();
6147 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6148 SDValue(GL, 0)));
6149 }
6150
6151 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
6152 };
6153
6154 SDValue Src0 = N->getOperand(1);
6155 SDValue Src1, Src2;
6156 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6157 IsSetInactive || IsPermLane16) {
6158 Src1 = N->getOperand(2);
6159 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
6160 Src2 = N->getOperand(3);
6161 }
6162
6163 if (ValSize == 32) {
6164 // Already legal
6165 return SDValue();
6166 }
6167
6168 if (ValSize < 32) {
6169 bool IsFloat = VT.isFloatingPoint();
6170 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
6171 SL, MVT::i32);
6172
6173 if (IsSetInactive || IsPermLane16) {
6174 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
6175 SL, MVT::i32);
6176 }
6177
6178 if (IID == Intrinsic::amdgcn_writelane) {
6179 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
6180 SL, MVT::i32);
6181 }
6182
6183 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
6184 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
6185 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
6186 }
6187
6188 if (ValSize % 32 != 0)
6189 return SDValue();
6190
6191 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
6192 EVT VT = N->getValueType(0);
6193 unsigned NE = VT.getVectorNumElements();
6194 EVT EltVT = VT.getVectorElementType();
6196 unsigned NumOperands = N->getNumOperands();
6197 SmallVector<SDValue, 4> Operands(NumOperands);
6198 SDNode *GL = N->getGluedNode();
6199
6200 // only handle convergencectrl_glue
6202
6203 for (unsigned i = 0; i != NE; ++i) {
6204 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
6205 ++j) {
6206 SDValue Operand = N->getOperand(j);
6207 EVT OperandVT = Operand.getValueType();
6208 if (OperandVT.isVector()) {
6209 // A vector operand; extract a single element.
6210 EVT OperandEltVT = OperandVT.getVectorElementType();
6211 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
6212 Operand, DAG.getVectorIdxConstant(i, SL));
6213 } else {
6214 // A scalar operand; just use it as is.
6215 Operands[j] = Operand;
6216 }
6217 }
6218
6219 if (GL)
6220 Operands[NumOperands - 1] =
6221 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
6222 SDValue(GL->getOperand(0).getNode(), 0));
6223
6224 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
6225 }
6226
6227 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
6228 return DAG.getBuildVector(VecVT, SL, Scalars);
6229 };
6230
6231 if (VT.isVector()) {
6232 switch (MVT::SimpleValueType EltTy =
6234 case MVT::i32:
6235 case MVT::f32: {
6236 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
6237 return unrollLaneOp(LaneOp.getNode());
6238 }
6239 case MVT::i16:
6240 case MVT::f16:
6241 case MVT::bf16: {
6242 MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
6244 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
6245 for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
6246 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6247 DAG.getConstant(EltIdx, SL, MVT::i32));
6248
6249 if (IsSetInactive || IsPermLane16)
6250 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6251 DAG.getConstant(EltIdx, SL, MVT::i32));
6252
6253 if (IID == Intrinsic::amdgcn_writelane)
6254 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
6255 DAG.getConstant(EltIdx, SL, MVT::i32));
6256
6257 Pieces.push_back(
6258 IsSetInactive || IsPermLane16
6259 ? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6260 : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6261 EltIdx += 2;
6262 }
6263 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6264 }
6265 default:
6266 // Handle all other cases by bitcasting to i32 vectors
6267 break;
6268 }
6269 }
6270
6271 MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6272 Src0 = DAG.getBitcast(VecVT, Src0);
6273
6274 if (IsSetInactive || IsPermLane16)
6275 Src1 = DAG.getBitcast(VecVT, Src1);
6276
6277 if (IID == Intrinsic::amdgcn_writelane)
6278 Src2 = DAG.getBitcast(VecVT, Src2);
6279
6280 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6281 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6282 return DAG.getBitcast(VT, UnrolledLaneOp);
6283}
6284
6287 SelectionDAG &DAG) const {
6288 switch (N->getOpcode()) {
6290 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6291 Results.push_back(Res);
6292 return;
6293 }
6295 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6296 Results.push_back(Res);
6297 return;
6298 }
6300 unsigned IID = N->getConstantOperandVal(0);
6301 switch (IID) {
6302 case Intrinsic::amdgcn_make_buffer_rsrc:
6303 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6304 return;
6305 case Intrinsic::amdgcn_cvt_pkrtz: {
6306 SDValue Src0 = N->getOperand(1);
6307 SDValue Src1 = N->getOperand(2);
6308 SDLoc SL(N);
6309 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6310 Src0, Src1);
6311 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6312 return;
6313 }
6314 case Intrinsic::amdgcn_cvt_pknorm_i16:
6315 case Intrinsic::amdgcn_cvt_pknorm_u16:
6316 case Intrinsic::amdgcn_cvt_pk_i16:
6317 case Intrinsic::amdgcn_cvt_pk_u16: {
6318 SDValue Src0 = N->getOperand(1);
6319 SDValue Src1 = N->getOperand(2);
6320 SDLoc SL(N);
6321 unsigned Opcode;
6322
6323 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6325 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6327 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6329 else
6331
6332 EVT VT = N->getValueType(0);
6333 if (isTypeLegal(VT))
6334 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6335 else {
6336 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6337 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6338 }
6339 return;
6340 }
6341 case Intrinsic::amdgcn_s_buffer_load: {
6342 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6343 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
6344 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
6345 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
6346 // s_buffer_load_i8.
6347 if (!Subtarget->hasScalarSubwordLoads())
6348 return;
6349 SDValue Op = SDValue(N, 0);
6350 SDValue Rsrc = Op.getOperand(1);
6351 SDValue Offset = Op.getOperand(2);
6352 SDValue CachePolicy = Op.getOperand(3);
6353 EVT VT = Op.getValueType();
6354 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6355 SDLoc DL(Op);
6357 const DataLayout &DataLayout = DAG.getDataLayout();
6358 Align Alignment =
6364 VT.getStoreSize(), Alignment);
6365 SDValue LoadVal;
6366 if (!Offset->isDivergent()) {
6367 SDValue Ops[] = {Rsrc, // source register
6368 Offset, CachePolicy};
6369 SDValue BufferLoad =
6371 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6372 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6373 } else {
6374 SDValue Ops[] = {
6375 DAG.getEntryNode(), // Chain
6376 Rsrc, // rsrc
6377 DAG.getConstant(0, DL, MVT::i32), // vindex
6378 {}, // voffset
6379 {}, // soffset
6380 {}, // offset
6381 CachePolicy, // cachepolicy
6382 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6383 };
6384 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6385 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6386 }
6387 Results.push_back(LoadVal);
6388 return;
6389 }
6390 }
6391 break;
6392 }
6394 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6395 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6396 // FIXME: Hacky
6397 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6398 Results.push_back(Res.getOperand(I));
6399 }
6400 } else {
6401 Results.push_back(Res);
6402 Results.push_back(Res.getValue(1));
6403 }
6404 return;
6405 }
6406
6407 break;
6408 }
6409 case ISD::SELECT: {
6410 SDLoc SL(N);
6411 EVT VT = N->getValueType(0);
6412 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6413 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6414 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6415
6416 EVT SelectVT = NewVT;
6417 if (NewVT.bitsLT(MVT::i32)) {
6418 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6419 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6420 SelectVT = MVT::i32;
6421 }
6422
6423 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6424 N->getOperand(0), LHS, RHS);
6425
6426 if (NewVT != SelectVT)
6427 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6428 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6429 return;
6430 }
6431 case ISD::FNEG: {
6432 if (N->getValueType(0) != MVT::v2f16)
6433 break;
6434
6435 SDLoc SL(N);
6436 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6437
6438 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6439 BC,
6440 DAG.getConstant(0x80008000, SL, MVT::i32));
6441 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6442 return;
6443 }
6444 case ISD::FABS: {
6445 if (N->getValueType(0) != MVT::v2f16)
6446 break;
6447
6448 SDLoc SL(N);
6449 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6450
6451 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6452 BC,
6453 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6454 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6455 return;
6456 }
6457 case ISD::FSQRT: {
6458 if (N->getValueType(0) != MVT::f16)
6459 break;
6460 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6461 break;
6462 }
6463 default:
6465 break;
6466 }
6467}
6468
6469/// Helper function for LowerBRCOND
6470static SDNode *findUser(SDValue Value, unsigned Opcode) {
6471
6472 SDNode *Parent = Value.getNode();
6473 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6474 I != E; ++I) {
6475
6476 if (I.getUse().get() != Value)
6477 continue;
6478
6479 if (I->getOpcode() == Opcode)
6480 return *I;
6481 }
6482 return nullptr;
6483}
6484
6485unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6486 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6487 switch (Intr->getConstantOperandVal(1)) {
6488 case Intrinsic::amdgcn_if:
6489 return AMDGPUISD::IF;
6490 case Intrinsic::amdgcn_else:
6491 return AMDGPUISD::ELSE;
6492 case Intrinsic::amdgcn_loop:
6493 return AMDGPUISD::LOOP;
6494 case Intrinsic::amdgcn_end_cf:
6495 llvm_unreachable("should not occur");
6496 default:
6497 return 0;
6498 }
6499 }
6500
6501 // break, if_break, else_break are all only used as inputs to loop, not
6502 // directly as branch conditions.
6503 return 0;
6504}
6505
6512
6514 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6515 return false;
6516
6517 // FIXME: Either avoid relying on address space here or change the default
6518 // address space for functions to avoid the explicit check.
6519 return (GV->getValueType()->isFunctionTy() ||
6522}
6523
6525 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6526}
6527
6529 if (!GV->hasExternalLinkage())
6530 return true;
6531
6532 const auto OS = getTargetMachine().getTargetTriple().getOS();
6533 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6534}
6535
6536/// This transforms the control flow intrinsics to get the branch destination as
6537/// last parameter, also switches branch target with BR if the need arise
6538SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6539 SelectionDAG &DAG) const {
6540 SDLoc DL(BRCOND);
6541
6542 SDNode *Intr = BRCOND.getOperand(1).getNode();
6543 SDValue Target = BRCOND.getOperand(2);
6544 SDNode *BR = nullptr;
6545 SDNode *SetCC = nullptr;
6546
6547 if (Intr->getOpcode() == ISD::SETCC) {
6548 // As long as we negate the condition everything is fine
6549 SetCC = Intr;
6550 Intr = SetCC->getOperand(0).getNode();
6551
6552 } else {
6553 // Get the target from BR if we don't negate the condition
6554 BR = findUser(BRCOND, ISD::BR);
6555 assert(BR && "brcond missing unconditional branch user");
6556 Target = BR->getOperand(1);
6557 }
6558
6559 unsigned CFNode = isCFIntrinsic(Intr);
6560 if (CFNode == 0) {
6561 // This is a uniform branch so we don't need to legalize.
6562 return BRCOND;
6563 }
6564
6565 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6566 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6567
6568 assert(!SetCC ||
6569 (SetCC->getConstantOperandVal(1) == 1 &&
6570 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6571 ISD::SETNE));
6572
6573 // operands of the new intrinsic call
6575 if (HaveChain)
6576 Ops.push_back(BRCOND.getOperand(0));
6577
6578 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6579 Ops.push_back(Target);
6580
6581 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6582
6583 // build the new intrinsic call
6584 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6585
6586 if (!HaveChain) {
6587 SDValue Ops[] = {
6588 SDValue(Result, 0),
6589 BRCOND.getOperand(0)
6590 };
6591
6592 Result = DAG.getMergeValues(Ops, DL).getNode();
6593 }
6594
6595 if (BR) {
6596 // Give the branch instruction our target
6597 SDValue Ops[] = {
6598 BR->getOperand(0),
6599 BRCOND.getOperand(2)
6600 };
6601 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6602 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6603 }
6604
6605 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6606
6607 // Copy the intrinsic results to registers
6608 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6610 if (!CopyToReg)
6611 continue;
6612
6613 Chain = DAG.getCopyToReg(
6614 Chain, DL,
6615 CopyToReg->getOperand(1),
6616 SDValue(Result, i - 1),
6617 SDValue());
6618
6619 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6620 }
6621
6622 // Remove the old intrinsic from the chain
6624 SDValue(Intr, Intr->getNumValues() - 1),
6625 Intr->getOperand(0));
6626
6627 return Chain;
6628}
6629
6630SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6631 SelectionDAG &DAG) const {
6632 MVT VT = Op.getSimpleValueType();
6633 SDLoc DL(Op);
6634 // Checking the depth
6635 if (Op.getConstantOperandVal(0) != 0)
6636 return DAG.getConstant(0, DL, VT);
6637
6640 // Check for kernel and shader functions
6641 if (Info->isEntryFunction())
6642 return DAG.getConstant(0, DL, VT);
6643
6644 MachineFrameInfo &MFI = MF.getFrameInfo();
6645 // There is a call to @llvm.returnaddress in this function
6646 MFI.setReturnAddressIsTaken(true);
6647
6649 // Get the return address reg and mark it as an implicit live-in
6650 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6651
6652 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6653}
6654
6655SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6656 SDValue Op,
6657 const SDLoc &DL,
6658 EVT VT) const {
6659 return Op.getValueType().bitsLE(VT) ?
6660 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6661 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6662 DAG.getTargetConstant(0, DL, MVT::i32));
6663}
6664
6665SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6666 assert(Op.getValueType() == MVT::f16 &&
6667 "Do not know how to custom lower FP_ROUND for non-f16 type");
6668
6669 SDValue Src = Op.getOperand(0);
6670 EVT SrcVT = Src.getValueType();
6671 if (SrcVT != MVT::f64)
6672 return Op;
6673
6674 // TODO: Handle strictfp
6675 if (Op.getOpcode() != ISD::FP_ROUND)
6676 return Op;
6677
6678 SDLoc DL(Op);
6679
6680 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6681 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6682 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6683}
6684
6685SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6686 SelectionDAG &DAG) const {
6687 EVT VT = Op.getValueType();
6688 const MachineFunction &MF = DAG.getMachineFunction();
6690 bool IsIEEEMode = Info->getMode().IEEE;
6691
6692 // FIXME: Assert during selection that this is only selected for
6693 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6694 // mode functions, but this happens to be OK since it's only done in cases
6695 // where there is known no sNaN.
6696 if (IsIEEEMode)
6697 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6698
6699 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6700 VT == MVT::v16bf16)
6701 return splitBinaryVectorOp(Op, DAG);
6702 return Op;
6703}
6704
6705SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6706 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6707 EVT VT = Op.getValueType();
6708 assert(VT == MVT::f16);
6709
6710 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6711 EVT ExpVT = Exp.getValueType();
6712 if (ExpVT == MVT::i16)
6713 return Op;
6714
6715 SDLoc DL(Op);
6716
6717 // Correct the exponent type for f16 to i16.
6718 // Clamp the range of the exponent to the instruction's range.
6719
6720 // TODO: This should be a generic narrowing legalization, and can easily be
6721 // for GlobalISel.
6722
6723 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6724 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6725
6726 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6727 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6728
6729 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6730
6731 if (IsStrict) {
6732 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6733 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6734 }
6735
6736 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6737}
6738
6740 switch (Op->getOpcode()) {
6741 case ISD::SRA:
6742 case ISD::SMIN:
6743 case ISD::SMAX:
6744 return ISD::SIGN_EXTEND;
6745 case ISD::SRL:
6746 case ISD::UMIN:
6747 case ISD::UMAX:
6748 return ISD::ZERO_EXTEND;
6749 case ISD::ADD:
6750 case ISD::SUB:
6751 case ISD::AND:
6752 case ISD::OR:
6753 case ISD::XOR:
6754 case ISD::SHL:
6755 case ISD::SELECT:
6756 case ISD::MUL:
6757 // operation result won't be influenced by garbage high bits.
6758 // TODO: are all of those cases correct, and are there more?
6759 return ISD::ANY_EXTEND;
6760 case ISD::SETCC: {
6761 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6763 }
6764 default:
6765 llvm_unreachable("unexpected opcode!");
6766 }
6767}
6768
6769SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
6770 DAGCombinerInfo &DCI) const {
6771 const unsigned Opc = Op.getOpcode();
6772 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
6773 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
6774 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
6775 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
6776 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
6777
6778 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
6779 : Op->getOperand(0).getValueType();
6780 auto ExtTy = OpTy.changeElementType(MVT::i32);
6781
6782 if (DCI.isBeforeLegalizeOps() ||
6783 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
6784 return SDValue();
6785
6786 auto &DAG = DCI.DAG;
6787
6788 SDLoc DL(Op);
6789 SDValue LHS;
6790 SDValue RHS;
6791 if (Opc == ISD::SELECT) {
6792 LHS = Op->getOperand(1);
6793 RHS = Op->getOperand(2);
6794 } else {
6795 LHS = Op->getOperand(0);
6796 RHS = Op->getOperand(1);
6797 }
6798
6799 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
6800 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
6801
6802 // Special case: for shifts, the RHS always needs a zext.
6803 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
6804 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
6805 else
6806 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
6807
6808 // setcc always return i1/i1 vec so no need to truncate after.
6809 if (Opc == ISD::SETCC) {
6810 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6811 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
6812 }
6813
6814 // For other ops, we extend the operation's return type as well so we need to
6815 // truncate back to the original type.
6816 SDValue NewVal;
6817 if (Opc == ISD::SELECT)
6818 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
6819 else
6820 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
6821
6822 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
6823}
6824
6825// Custom lowering for vector multiplications and s_mul_u64.
6826SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6827 EVT VT = Op.getValueType();
6828
6829 // Split vector operands.
6830 if (VT.isVector())
6831 return splitBinaryVectorOp(Op, DAG);
6832
6833 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6834
6835 // There are four ways to lower s_mul_u64:
6836 //
6837 // 1. If all the operands are uniform, then we lower it as it is.
6838 //
6839 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
6840 // multiplications because there is not a vector equivalent of s_mul_u64.
6841 //
6842 // 3. If the cost model decides that it is more efficient to use vector
6843 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
6844 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
6845 //
6846 // 4. If the cost model decides to use vector registers and both of the
6847 // operands are zero-extended/sign-extended from 32-bits, then we split the
6848 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
6849 // possible to check if the operands are zero-extended or sign-extended in
6850 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6851 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6852 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6853 // If the cost model decides that we have to use vector registers, then
6854 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
6855 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
6856 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6857 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6858 // SIInstrInfo.cpp .
6859
6860 if (Op->isDivergent())
6861 return SDValue();
6862
6863 SDValue Op0 = Op.getOperand(0);
6864 SDValue Op1 = Op.getOperand(1);
6865 // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
6866 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6867 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6868 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6869 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6870 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6871 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6872 SDLoc SL(Op);
6873 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6874 return SDValue(
6875 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6876 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6877 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6878 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6879 return SDValue(
6880 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6881 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6882 return Op;
6883}
6884
6885SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6886 EVT VT = Op.getValueType();
6887 SDLoc SL(Op);
6888 SDValue LHS = Op.getOperand(0);
6889 SDValue RHS = Op.getOperand(1);
6890 bool isSigned = Op.getOpcode() == ISD::SMULO;
6891
6892 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6893 const APInt &C = RHSC->getAPIntValue();
6894 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
6895 if (C.isPowerOf2()) {
6896 // smulo(x, signed_min) is same as umulo(x, signed_min).
6897 bool UseArithShift = isSigned && !C.isMinSignedValue();
6898 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6899 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6900 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6901 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6902 SL, VT, Result, ShiftAmt),
6903 LHS, ISD::SETNE);
6904 return DAG.getMergeValues({ Result, Overflow }, SL);
6905 }
6906 }
6907
6908 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6910 SL, VT, LHS, RHS);
6911
6912 SDValue Sign = isSigned
6913 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6914 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6915 : DAG.getConstant(0, SL, VT);
6916 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6917
6918 return DAG.getMergeValues({ Result, Overflow }, SL);
6919}
6920
6921SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6922 if (Op->isDivergent()) {
6923 // Select to V_MAD_[IU]64_[IU]32.
6924 return Op;
6925 }
6926 if (Subtarget->hasSMulHi()) {
6927 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6928 return SDValue();
6929 }
6930 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6931 // calculate the high part, so we might as well do the whole thing with
6932 // V_MAD_[IU]64_[IU]32.
6933 return Op;
6934}
6935
6936SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6937 if (!Subtarget->isTrapHandlerEnabled() ||
6939 return lowerTrapEndpgm(Op, DAG);
6940
6941 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6942 lowerTrapHsaQueuePtr(Op, DAG);
6943}
6944
6945SDValue SITargetLowering::lowerTrapEndpgm(
6946 SDValue Op, SelectionDAG &DAG) const {
6947 SDLoc SL(Op);
6948 SDValue Chain = Op.getOperand(0);
6949 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6950}
6951
6952SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6953 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6956 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6958 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6961}
6962
6963SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6964 SDValue Op, SelectionDAG &DAG) const {
6965 SDLoc SL(Op);
6966 SDValue Chain = Op.getOperand(0);
6967
6968 SDValue QueuePtr;
6969 // For code object version 5, QueuePtr is passed through implicit kernarg.
6970 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6972 QueuePtr =
6973 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6974 } else {
6977 Register UserSGPR = Info->getQueuePtrUserSGPR();
6978
6979 if (UserSGPR == AMDGPU::NoRegister) {
6980 // We probably are in a function incorrectly marked with
6981 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6982 // trap, so just use a null pointer.
6983 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6984 } else {
6985 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6986 MVT::i64);
6987 }
6988 }
6989
6990 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6991 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6992 QueuePtr, SDValue());
6993
6995 SDValue Ops[] = {
6996 ToReg,
6997 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6998 SGPR01,
6999 ToReg.getValue(1)
7000 };
7001 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7002}
7003
7004SDValue SITargetLowering::lowerTrapHsa(
7005 SDValue Op, SelectionDAG &DAG) const {
7006 SDLoc SL(Op);
7007 SDValue Chain = Op.getOperand(0);
7008
7009 // We need to simulate the 's_trap 2' instruction on targets that run in
7010 // PRIV=1 (where it is treated as a nop).
7011 if (Subtarget->hasPrivEnabledTrap2NopBug())
7012 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
7013
7015 SDValue Ops[] = {
7016 Chain,
7017 DAG.getTargetConstant(TrapID, SL, MVT::i16)
7018 };
7019 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7020}
7021
7022SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
7023 SDLoc SL(Op);
7024 SDValue Chain = Op.getOperand(0);
7026
7027 if (!Subtarget->isTrapHandlerEnabled() ||
7030 "debugtrap handler not supported",
7031 Op.getDebugLoc(),
7032 DS_Warning);
7033 LLVMContext &Ctx = MF.getFunction().getContext();
7034 Ctx.diagnose(NoTrap);
7035 return Chain;
7036 }
7037
7039 SDValue Ops[] = {
7040 Chain,
7041 DAG.getTargetConstant(TrapID, SL, MVT::i16)
7042 };
7043 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
7044}
7045
7046SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
7047 SelectionDAG &DAG) const {
7048 if (Subtarget->hasApertureRegs()) {
7049 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
7050 ? AMDGPU::SRC_SHARED_BASE
7051 : AMDGPU::SRC_PRIVATE_BASE;
7052 // Note: this feature (register) is broken. When used as a 32-bit operand,
7053 // it returns a wrong value (all zeroes?). The real value is in the upper 32
7054 // bits.
7055 //
7056 // To work around the issue, directly emit a 64 bit mov from this register
7057 // then extract the high bits. Note that this shouldn't even result in a
7058 // shift being emitted and simply become a pair of registers (e.g.):
7059 // s_mov_b64 s[6:7], src_shared_base
7060 // v_mov_b32_e32 v1, s7
7061 //
7062 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
7063 // coalescing would kick in and it would think it's okay to use the "HI"
7064 // subregister directly (instead of extracting the HI 32 bits) which is an
7065 // artificial (unusable) register.
7066 // Register TableGen definitions would need an overhaul to get rid of the
7067 // artificial "HI" aperture registers and prevent this kind of issue from
7068 // happening.
7069 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
7070 DAG.getRegister(ApertureRegNo, MVT::i64));
7071 return DAG.getNode(
7072 ISD::TRUNCATE, DL, MVT::i32,
7073 DAG.getNode(ISD::SRL, DL, MVT::i64,
7074 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
7075 }
7076
7077 // For code object version 5, private_base and shared_base are passed through
7078 // implicit kernargs.
7079 const Module *M = DAG.getMachineFunction().getFunction().getParent();
7081 ImplicitParameter Param =
7083 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
7084 }
7085
7088 Register UserSGPR = Info->getQueuePtrUserSGPR();
7089 if (UserSGPR == AMDGPU::NoRegister) {
7090 // We probably are in a function incorrectly marked with
7091 // amdgpu-no-queue-ptr. This is undefined.
7092 return DAG.getUNDEF(MVT::i32);
7093 }
7094
7095 SDValue QueuePtr = CreateLiveInRegister(
7096 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
7097
7098 // Offset into amd_queue_t for group_segment_aperture_base_hi /
7099 // private_segment_aperture_base_hi.
7100 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
7101
7102 SDValue Ptr =
7103 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
7104
7105 // TODO: Use custom target PseudoSourceValue.
7106 // TODO: We should use the value from the IR intrinsic call, but it might not
7107 // be available and how do we get it?
7109 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
7110 commonAlignment(Align(64), StructOffset),
7113}
7114
7115/// Return true if the value is a known valid address, such that a null check is
7116/// not necessary.
7118 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
7121 return true;
7122
7123 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
7124 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
7125
7126 // TODO: Search through arithmetic, handle arguments and loads
7127 // marked nonnull.
7128 return false;
7129}
7130
7131SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
7132 SelectionDAG &DAG) const {
7133 SDLoc SL(Op);
7134
7135 const AMDGPUTargetMachine &TM =
7136 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
7137
7138 unsigned DestAS, SrcAS;
7139 SDValue Src;
7140 bool IsNonNull = false;
7141 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
7142 SrcAS = ASC->getSrcAddressSpace();
7143 Src = ASC->getOperand(0);
7144 DestAS = ASC->getDestAddressSpace();
7145 } else {
7146 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
7147 Op.getConstantOperandVal(0) ==
7148 Intrinsic::amdgcn_addrspacecast_nonnull);
7149 Src = Op->getOperand(1);
7150 SrcAS = Op->getConstantOperandVal(2);
7151 DestAS = Op->getConstantOperandVal(3);
7152 IsNonNull = true;
7153 }
7154
7155 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
7156
7157 // flat -> local/private
7158 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
7159 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
7160 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
7161 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7162
7163 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7164 return Ptr;
7165
7166 unsigned NullVal = TM.getNullPointerValue(DestAS);
7167 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7168 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
7169
7170 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
7171 SegmentNullPtr);
7172 }
7173 }
7174
7175 // local/private -> flat
7176 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
7177 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
7178 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
7179
7180 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
7181 SDValue CvtPtr =
7182 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
7183 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
7184
7185 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
7186 return CvtPtr;
7187
7188 unsigned NullVal = TM.getNullPointerValue(SrcAS);
7189 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
7190
7191 SDValue NonNull
7192 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
7193
7194 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
7195 FlatNullPtr);
7196 }
7197 }
7198
7199 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7200 Op.getValueType() == MVT::i64) {
7201 const SIMachineFunctionInfo *Info =
7203 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
7204 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
7205 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7206 }
7207
7208 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
7209 Src.getValueType() == MVT::i64)
7210 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
7211
7212 // global <-> flat are no-ops and never emitted.
7213
7214 const MachineFunction &MF = DAG.getMachineFunction();
7215 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
7216 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
7217 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
7218
7219 return DAG.getUNDEF(Op->getValueType(0));
7220}
7221
7222// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
7223// the small vector and inserting them into the big vector. That is better than
7224// the default expansion of doing it via a stack slot. Even though the use of
7225// the stack slot would be optimized away afterwards, the stack slot itself
7226// remains.
7227SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
7228 SelectionDAG &DAG) const {
7229 SDValue Vec = Op.getOperand(0);
7230 SDValue Ins = Op.getOperand(1);
7231 SDValue Idx = Op.getOperand(2);
7232 EVT VecVT = Vec.getValueType();
7233 EVT InsVT = Ins.getValueType();
7234 EVT EltVT = VecVT.getVectorElementType();
7235 unsigned InsNumElts = InsVT.getVectorNumElements();
7236 unsigned IdxVal = Idx->getAsZExtVal();
7237 SDLoc SL(Op);
7238
7239 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
7240 // Insert 32-bit registers at a time.
7241 assert(InsNumElts % 2 == 0 && "expect legal vector types");
7242
7243 unsigned VecNumElts = VecVT.getVectorNumElements();
7244 EVT NewVecVT =
7245 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
7246 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
7248 MVT::i32, InsNumElts / 2);
7249
7250 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
7251 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
7252
7253 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
7254 SDValue Elt;
7255 if (InsNumElts == 2) {
7256 Elt = Ins;
7257 } else {
7258 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
7259 DAG.getConstant(I, SL, MVT::i32));
7260 }
7261 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
7262 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
7263 }
7264
7265 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
7266 }
7267
7268 for (unsigned I = 0; I != InsNumElts; ++I) {
7269 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
7270 DAG.getConstant(I, SL, MVT::i32));
7271 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
7272 DAG.getConstant(IdxVal + I, SL, MVT::i32));
7273 }
7274 return Vec;
7275}
7276
7277SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
7278 SelectionDAG &DAG) const {
7279 SDValue Vec = Op.getOperand(0);
7280 SDValue InsVal = Op.getOperand(1);
7281 SDValue Idx = Op.getOperand(2);
7282 EVT VecVT = Vec.getValueType();
7283 EVT EltVT = VecVT.getVectorElementType();
7284 unsigned VecSize = VecVT.getSizeInBits();
7285 unsigned EltSize = EltVT.getSizeInBits();
7286 SDLoc SL(Op);
7287
7288 // Specially handle the case of v4i16 with static indexing.
7289 unsigned NumElts = VecVT.getVectorNumElements();
7290 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7291 if (NumElts == 4 && EltSize == 16 && KIdx) {
7292 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7293
7294 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7295 DAG.getConstant(0, SL, MVT::i32));
7296 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7297 DAG.getConstant(1, SL, MVT::i32));
7298
7299 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7300 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7301
7302 unsigned Idx = KIdx->getZExtValue();
7303 bool InsertLo = Idx < 2;
7304 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7305 InsertLo ? LoVec : HiVec,
7306 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7307 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7308
7309 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7310
7311 SDValue Concat = InsertLo ?
7312 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7313 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7314
7315 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7316 }
7317
7318 // Static indexing does not lower to stack access, and hence there is no need
7319 // for special custom lowering to avoid stack access.
7320 if (isa<ConstantSDNode>(Idx))
7321 return SDValue();
7322
7323 // Avoid stack access for dynamic indexing by custom lowering to
7324 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7325
7326 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7327
7328 MVT IntVT = MVT::getIntegerVT(VecSize);
7329
7330 // Convert vector index to bit-index and get the required bit mask.
7331 assert(isPowerOf2_32(EltSize));
7332 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7333 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7334 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7335 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7336 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7337
7338 // 1. Create a congruent vector with the target value in each element.
7339 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7340 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7341
7342 // 2. Mask off all other indices except the required index within (1).
7343 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7344
7345 // 3. Mask off the required index within the target vector.
7346 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7347 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7348 DAG.getNOT(SL, BFM, IntVT), BCVec);
7349
7350 // 4. Get (2) and (3) ORed into the target vector.
7351 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7352
7353 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7354}
7355
7356SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7357 SelectionDAG &DAG) const {
7358 SDLoc SL(Op);
7359
7360 EVT ResultVT = Op.getValueType();
7361 SDValue Vec = Op.getOperand(0);
7362 SDValue Idx = Op.getOperand(1);
7363 EVT VecVT = Vec.getValueType();
7364 unsigned VecSize = VecVT.getSizeInBits();
7365 EVT EltVT = VecVT.getVectorElementType();
7366
7367 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7368
7369 // Make sure we do any optimizations that will make it easier to fold
7370 // source modifiers before obscuring it with bit operations.
7371
7372 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7373 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7374 return Combined;
7375
7376 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7377 SDValue Lo, Hi;
7378 EVT LoVT, HiVT;
7379 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7380
7381 if (VecSize == 128) {
7382 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7383 Lo = DAG.getBitcast(LoVT,
7384 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7385 DAG.getConstant(0, SL, MVT::i32)));
7386 Hi = DAG.getBitcast(HiVT,
7387 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7388 DAG.getConstant(1, SL, MVT::i32)));
7389 } else if (VecSize == 256) {
7390 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7391 SDValue Parts[4];
7392 for (unsigned P = 0; P < 4; ++P) {
7393 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7394 DAG.getConstant(P, SL, MVT::i32));
7395 }
7396
7397 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7398 Parts[0], Parts[1]));
7399 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7400 Parts[2], Parts[3]));
7401 } else {
7402 assert(VecSize == 512);
7403
7404 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7405 SDValue Parts[8];
7406 for (unsigned P = 0; P < 8; ++P) {
7407 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7408 DAG.getConstant(P, SL, MVT::i32));
7409 }
7410
7411 Lo = DAG.getBitcast(LoVT,
7412 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7413 Parts[0], Parts[1], Parts[2], Parts[3]));
7414 Hi = DAG.getBitcast(HiVT,
7415 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7416 Parts[4], Parts[5],Parts[6], Parts[7]));
7417 }
7418
7419 EVT IdxVT = Idx.getValueType();
7420 unsigned NElem = VecVT.getVectorNumElements();
7421 assert(isPowerOf2_32(NElem));
7422 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7423 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7424 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7425 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7426 }
7427
7428 assert(VecSize <= 64);
7429
7430 MVT IntVT = MVT::getIntegerVT(VecSize);
7431
7432 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7433 SDValue VecBC = peekThroughBitcasts(Vec);
7434 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7435 SDValue Src = VecBC.getOperand(0);
7436 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7437 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7438 }
7439
7440 unsigned EltSize = EltVT.getSizeInBits();
7441 assert(isPowerOf2_32(EltSize));
7442
7443 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7444
7445 // Convert vector index to bit-index (* EltSize)
7446 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7447
7448 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7449 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7450
7451 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7452 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7453 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7454 }
7455
7456 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7457}
7458
7459static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7460 assert(Elt % 2 == 0);
7461 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7462}
7463
7464SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7465 SelectionDAG &DAG) const {
7466 SDLoc SL(Op);
7467 EVT ResultVT = Op.getValueType();
7469
7470 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7471 EVT EltVT = PackVT.getVectorElementType();
7472 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7473
7474 // vector_shuffle <0,1,6,7> lhs, rhs
7475 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7476 //
7477 // vector_shuffle <6,7,2,3> lhs, rhs
7478 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7479 //
7480 // vector_shuffle <6,7,0,1> lhs, rhs
7481 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7482
7483 // Avoid scalarizing when both halves are reading from consecutive elements.
7485 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7486 if (elementPairIsContiguous(SVN->getMask(), I)) {
7487 const int Idx = SVN->getMaskElt(I);
7488 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7489 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7490 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7491 PackVT, SVN->getOperand(VecIdx),
7492 DAG.getConstant(EltIdx, SL, MVT::i32));
7493 Pieces.push_back(SubVec);
7494 } else {
7495 const int Idx0 = SVN->getMaskElt(I);
7496 const int Idx1 = SVN->getMaskElt(I + 1);
7497 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7498 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7499 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7500 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7501
7502 SDValue Vec0 = SVN->getOperand(VecIdx0);
7503 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7504 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7505
7506 SDValue Vec1 = SVN->getOperand(VecIdx1);
7507 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7508 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7509 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7510 }
7511 }
7512
7513 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7514}
7515
7516SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7517 SelectionDAG &DAG) const {
7518 SDValue SVal = Op.getOperand(0);
7519 EVT ResultVT = Op.getValueType();
7520 EVT SValVT = SVal.getValueType();
7521 SDValue UndefVal = DAG.getUNDEF(SValVT);
7522 SDLoc SL(Op);
7523
7525 VElts.push_back(SVal);
7526 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7527 VElts.push_back(UndefVal);
7528
7529 return DAG.getBuildVector(ResultVT, SL, VElts);
7530}
7531
7532SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7533 SelectionDAG &DAG) const {
7534 SDLoc SL(Op);
7535 EVT VT = Op.getValueType();
7536
7537 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
7538 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7539
7540 SDValue Lo = Op.getOperand(0);
7541 SDValue Hi = Op.getOperand(1);
7542
7543 // Avoid adding defined bits with the zero_extend.
7544 if (Hi.isUndef()) {
7545 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7546 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7547 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7548 }
7549
7550 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7551 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7552
7553 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7554 DAG.getConstant(16, SL, MVT::i32));
7555 if (Lo.isUndef())
7556 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7557
7558 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7559 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7560
7561 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7562 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7563 }
7564
7565 // Split into 2-element chunks.
7566 const unsigned NumParts = VT.getVectorNumElements() / 2;
7568 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
7569
7571 for (unsigned P = 0; P < NumParts; ++P) {
7572 SDValue Vec = DAG.getBuildVector(
7573 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
7574 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
7575 }
7576
7577 SDValue Blend =
7578 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
7579 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7580}
7581
7582bool
7584 // OSes that use ELF REL relocations (instead of RELA) can only store a
7585 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7586 // which can create arbitrary 64-bit addends. (This is only a problem for
7587 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7588 // the high 32 bits of the addend.)
7589 //
7590 // This should be kept in sync with how HasRelocationAddend is initialized in
7591 // the constructor of ELFAMDGPUAsmBackend.
7592 if (!Subtarget->isAmdHsaOS())
7593 return false;
7594
7595 // We can fold offsets for anything that doesn't require a GOT relocation.
7596 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
7600}
7601
// Build a PC-relative address to global \p GV (plus constant \p Offset) as an
// AMDGPUISD::PC_ADD_REL_OFFSET node. The low operand always carries the
// target global tagged with \p GAFlags; the high operand is a zero constant
// for the MO_NONE (fixup) form, otherwise the same global tagged with
// GAFlags + 1 — assumed to be the matching *_HI flavor of \p GAFlags.
static SDValue
                         const SDLoc &DL, int64_t Offset, EVT PtrVT,
                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
  // The +4 accounts for the $symbol operand being encoded 4 bytes past the
  // address that s_getpc_b64 returns (see the sequence below), so the final
  // pc-relative literal must still fit in 32 bits.
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
  // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
  // lowered to the following code sequence:
  //
  // For constant address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol
  //   s_addc_u32 s1, s1, 0
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   a fixup or relocation is emitted to replace $symbol with a literal
  //   constant, which is a pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  //
  // For global address space:
  //   s_getpc_b64 s[0:1]
  //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
  //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
  //
  //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
  //   fixups or relocations are emitted to replace $symbol@*@lo and
  //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
  //   which is a 64-bit pc-relative offset from the encoding of the $symbol
  //   operand to the global variable.
  SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
  SDValue PtrHi;
  if (GAFlags == SIInstrInfo::MO_NONE)
    PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
  else
    // The paired *_HI relocation flag immediately follows the *_LO flag.
    PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
7638
// Lower a GlobalAddressSDNode. Handles, in order: zero-sized (dynamic) LDS
// globals lowered to GET_GROUPSTATICSIZE, other LDS globals via the
// AMDGPUISD::LDS node, PAL/Mesa3D absolute 32-bit lo/hi pairs, PC-relative
// fixups and relocations, and finally an indirect load through the GOT.
SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SDValue Op,
                                             SelectionDAG &DAG) const {
  SDLoc DL(GSD);
  EVT PtrVT = Op.getValueType();

  const GlobalValue *GV = GSD->getGlobal();
      GV->hasExternalLinkage()) {
    Type *Ty = GV->getValueType();
    // HIP uses an unsized array `extern __shared__ T s[]` or similar
    // zero-sized type in other languages to declare the dynamic shared
    // memory which size is not known at the compile time. They will be
    // allocated by the runtime and placed directly after the static
    // allocated ones. They all share the same offset.
    if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
      assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
      // Adjust alignment for that dynamic shared memory array.
      MFI->setUsesDynamicLDS(true);
      // The dynamic array starts right after the static allocations, so its
      // address is the total static group-segment size.
      return SDValue(
          DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
    }
  }
  }

  // Statically allocated LDS: resolved later via the AMDGPUISD::LDS node.
  SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
  return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  }

  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    // PAL/Mesa: materialize the absolute 64-bit address from two 32-bit
    // absolute relocations moved into SGPRs.
    SDValue AddrLo = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
    AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};

    SDValue AddrHi = DAG.getTargetGlobalAddress(
        GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
    AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};

    return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
  }

  if (shouldEmitFixup(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);

  if (shouldEmitPCReloc(GV))
    return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,

  // Otherwise the address must come from the GOT: compute the GOT slot
  // address pc-relatively, then load the pointer from it.
  SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,

  Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment = DataLayout.getABITypeAlign(PtrTy);
  MachinePointerInfo PtrInfo

  return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
}
7711
                                 const SDLoc &DL, SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  // Result 0 is the output chain (MVT::Other); the glue output is result 1.
  return SDValue(M0, 0);
}
7727
7728SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7729 SDValue Op,
7730 MVT VT,
7731 unsigned Offset) const {
7732 SDLoc SL(Op);
7733 SDValue Param = lowerKernargMemParameter(
7734 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
7735 // The local size values will have the hi 16-bits as zero.
7736 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7737 DAG.getValueType(VT));
7738}
7739
                                 EVT VT) {
                                  "non-hsa intrinsic with hsa target",
                                  DL.getDebugLoc());
  // Report through the LLVMContext diagnostic handler, then recover with
  // UNDEF of the expected type so lowering can continue.
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
7748
                                 EVT VT) {
                                  "intrinsic not supported on subtarget",
                                  DL.getDebugLoc());
  // Report through the LLVMContext diagnostic handler, then recover with
  // UNDEF of the expected type so lowering can continue.
  DAG.getContext()->diagnose(BadIntrin);
  return DAG.getUNDEF(VT);
}
7757
7759 ArrayRef<SDValue> Elts) {
7760 assert(!Elts.empty());
7761 MVT Type;
7762 unsigned NumElts = Elts.size();
7763
7764 if (NumElts <= 12) {
7765 Type = MVT::getVectorVT(MVT::f32, NumElts);
7766 } else {
7767 assert(Elts.size() <= 16);
7768 Type = MVT::v16f32;
7769 NumElts = 16;
7770 }
7771
7772 SmallVector<SDValue, 16> VecElts(NumElts);
7773 for (unsigned i = 0; i < Elts.size(); ++i) {
7774 SDValue Elt = Elts[i];
7775 if (Elt.getValueType() != MVT::f32)
7776 Elt = DAG.getBitcast(MVT::f32, Elt);
7777 VecElts[i] = Elt;
7778 }
7779 for (unsigned i = Elts.size(); i < NumElts; ++i)
7780 VecElts[i] = DAG.getUNDEF(MVT::f32);
7781
7782 if (NumElts == 1)
7783 return VecElts[0];
7784 return DAG.getBuildVector(Type, DL, VecElts);
7785}
7786
// Widen \p Src to \p CastVT by appending \p ExtraElts undef elements of the
// source's scalar type, returning the result as a BUILD_VECTOR of CastVT.
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
                              SDValue Src, int ExtraElts) {
  EVT SrcVT = Src.getValueType();

  // Collect the existing scalar elements (or the lone scalar itself).
  if (SrcVT.isVector())
    DAG.ExtractVectorElements(Src, Elts);
  else
    Elts.push_back(Src);

  SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
  while (ExtraElts--)
    Elts.push_back(Undef);

  return DAG.getBuildVector(CastVT, DL, Elts);
}
7804
// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which means
// the required return type is an aggregate.
                                 ArrayRef<EVT> ResultTypes, bool IsTexFail,
                                 bool Unpacked, bool IsD16, int DMaskPop,
                                 int NumVDataDwords, bool IsAtomicPacked16Bit,
                                 const SDLoc &DL) {
  // Determine the required return type. This is the same regardless of IsTexFail flag
  EVT ReqRetVT = ResultTypes[0];
  int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
  // Packed D16 results and packed 16-bit atomics hold two elements per dword.
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
                          : ReqRetNumElts;

  // Number of dwords the instruction actually populated for the dmask.
  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;

  MVT DataDwordVT = NumDataDwords == 1 ?
    MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);

  MVT MaskPopVT = MaskPopDwords == 1 ?
    MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

  SDValue Data(Result, 0);
  SDValue TexFail;

  // Trim the raw machine-node result down to just the dmask-populated dwords.
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
    SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
    if (MaskPopVT.isVector()) {
      Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    } else {
      Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
                         SDValue(Result, 0), ZeroIdx);
    }
  }

  // Pad back out (with undef) to the dword count the IR result type expects.
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
    Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
                          NumDataDwords - MaskPopDwords);

  // Convert packed/unpacked D16 data to the requested 16-bit element type.
  if (IsD16)
    Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);

  EVT LegalReqRetVT = ReqRetVT;
  if (!ReqRetVT.isVector()) {
    if (!Data.getValueType().isInteger())
      Data = DAG.getNode(ISD::BITCAST, DL,
                         Data.getValueType().changeTypeToInteger(), Data);
    Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
  } else {
    // We need to widen the return vector to a legal type
    if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
        ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
      LegalReqRetVT =
          ReqRetVT.getVectorNumElements() + 1);
    }
  }
  Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);

  if (IsTexFail) {
    // The TFE/LWE status dword immediately follows the data dwords.
    TexFail =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
                    DAG.getConstant(MaskPopDwords, DL, MVT::i32));

    return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
  }

  if (Result->getNumValues() == 1)
    return Data;

  // Re-attach the chain (result 1 of the machine node).
  return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}
7879
7880static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7881 SDValue *LWE, bool &IsTexFail) {
7882 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7883
7884 uint64_t Value = TexFailCtrlConst->getZExtValue();
7885 if (Value) {
7886 IsTexFail = true;
7887 }
7888
7889 SDLoc DL(TexFailCtrlConst);
7890 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7891 Value &= ~(uint64_t)0x1;
7892 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7893 Value &= ~(uint64_t)0x2;
7894
7895 return Value == 0;
7896}
7897
// Pack 16-bit image operands in [DimIdx, EndIdx) into 32-bit dwords appended
// to \p PackedAddrs: adjacent operands are paired into PackVectorVT where
// possible, while operands that must stay alone (the trailing one, or odd
// gradient components) are any-extended to i32 instead.
                                      MVT PackVectorVT,
                                      SmallVectorImpl<SDValue> &PackedAddrs,
                                      unsigned DimIdx, unsigned EndIdx,
                                      unsigned NumGradients) {
  SDLoc DL(Op);
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    SDValue Addr = Op.getOperand(I);

    // Gradients are packed with undef for each coordinate.
    // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
    //  1D: undef,dx/dh; undef,dx/dv
    //  2D: dy/dh,dx/dh; dy/dv,dx/dv
    //  3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      // Lone operand: widen it to occupy a full dword on its own.
      if (Addr.getValueType() != MVT::i16)
        Addr = DAG.getBitcast(MVT::i16, Addr);
      Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
    } else {
      // Pair this operand with the next one and skip it.
      Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
      I++;
    }
    // vaddr operands are uniformly carried as f32 dwords.
    Addr = DAG.getBitcast(MVT::f32, Addr);
    PackedAddrs.push_back(Addr);
  }
}
7926
// Lower an image (MIMG) intrinsic to a target machine node: collects the
// vdata and dmask, packs 16-bit addresses/gradients into dwords, chooses NSA
// vs. contiguous vaddr form, validates cache-policy bits, and finally selects
// the encoding-specific opcode for the subtarget. Returns the original Op
// unchanged whenever the intrinsic cannot be selected here.
SDValue SITargetLowering::lowerImage(SDValue Op,
                                     SelectionDAG &DAG, bool WithChain) const {
  SDLoc DL(Op);
  const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
  bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
  bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);

  SmallVector<EVT, 3> ResultTypes(Op->values());
  SmallVector<EVT, 3> OrigResultTypes(Op->values());
  bool IsD16 = false;
  bool IsG16 = false;
  bool IsA16 = false;
  SDValue VData;
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;

  // Offset of intrinsic arguments
  const unsigned ArgOffset = WithChain ? 2 : 1;

  unsigned DMask;
  unsigned DMaskLanes = 0;

  // Work out the vdata operand and the dmask/result-dword counts.
  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);

    IsAtomicPacked16Bit =
        (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

    bool Is64Bit = VData.getValueSizeInBits() == 64;
    if (BaseOpcode->AtomicX2) {
      // Cmpswap-style atomics carry two data operands, merged into a vector.
      SDValue VData2 = Op.getOperand(3);
      VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
                                 {VData, VData2});
      if (Is64Bit)
        VData = DAG.getBitcast(MVT::v4i32, VData);

      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);

      MVT StoreVT = VData.getSimpleValueType();
      if (StoreVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
        VData = handleD16VData(VData, DAG, true);
      }

      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else if (!BaseOpcode->NoReturn) {
      // Work out the num dwords based on the dmask popcount and underlying type
      // and whether packing is supported.
      MVT LoadVT = ResultTypes[0].getSimpleVT();
      if (LoadVT.getScalarType() == MVT::f16) {
        if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          return Op; // D16 is unsupported for this instruction

        IsD16 = true;
      }

      // Confirm that the return type is large enough for the dmask specified
      if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
          (!LoadVT.isVector() && DMaskLanes > 1))
        return Op;

      // The sq block of gfx8 and gfx9 do not estimate register use correctly
      // for d16 image_gather4, image_gather4_l, and image_gather4_lz
      // instructions.
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
      else
        NumVDataDwords = DMaskLanes;

      AdjustRetType = true;
    }
  }

  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;

  // Check for 16 bit addresses or derivatives and pack if true.
  MVT VAddrVT =
      Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
  MVT VAddrScalarVT = VAddrVT.getScalarType();
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  VAddrScalarVT = VAddrVT.getScalarType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;

  // Push back extra arguments.
  for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
      assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
      // Special handling of bias when A16 is on. Bias is of type half but
      // occupies full 32-bit.
      SDValue Bias = DAG.getBuildVector(
          MVT::v2f16, DL,
          {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
      VAddrs.push_back(Bias);
    } else {
      assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
             "Bias needs to be converted to 16 bit in A16 mode");
      VAddrs.push_back(Op.getOperand(ArgOffset + I));
    }
  }

  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    // 16 bit gradients are supported, but are tied to the A16 control
    // so both gradients and addresses must be 16 bit
    LLVM_DEBUG(
        dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                  "require 16 bit args for both gradients and addresses");
    return Op;
  }

  if (IsA16) {
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
      return Op;
    }
  }

  // We've dealt with incorrect input so we know that if IsA16, IsG16
  // are set then we have to compress/pack operands (either address,
  // gradient or both)
  // In the case where a16 and gradients are tied (no G16 support) then we
  // have already verified that both IsA16 and IsG16 are true
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    // Activate g16
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
    IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
  }

  // Add gradients (packed or unpacked)
  if (IsG16) {
    // Pack the gradients
    // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
    packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
                              ArgOffset + Intr->GradientStart,
                              ArgOffset + Intr->CoordStart, Intr->NumGradients);
  } else {
    for (unsigned I = ArgOffset + Intr->GradientStart;
         I < ArgOffset + Intr->CoordStart; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // Add addresses (packed or unpacked)
  if (IsA16) {
    packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
                              ArgOffset + Intr->CoordStart, VAddrEnd,
                              0 /* No gradients */);
  } else {
    // Add uncompressed address
    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
      VAddrs.push_back(Op.getOperand(I));
  }

  // If the register allocator cannot place the address registers contiguously
  // without introducing moves, then using the non-sequential address encoding
  // is always preferable, since it saves VALU instructions and is usually a
  // wash in terms of code size or even better.
  //
  // However, we currently have no way of hinting to the register allocator that
  // MIMG addresses should be placed contiguously when it is possible to do so,
  // so force non-NSA for the common 2-address case as a heuristic.
  //
  // SIShrinkInstructions will convert NSA encodings to non-NSA after register
  // allocation when possible.
  //
  // Partial NSA is allowed on GFX11+ where the final register is a contiguous
  // set of the remaining addresses.
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  SDValue VAddr;
  if (UsePartialNSA) {
    // Only the addresses that exceed the NSA limit get merged into one vector.
    VAddr = getBuildDwordsVector(DAG, DL,
                                 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  }
  else if (!UseNSA) {
    VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
  }

  SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
  SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
  SDValue Unorm;
  if (!BaseOpcode->Sampler) {
    Unorm = True;
  } else {
    uint64_t UnormConst =
        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;
  }

  SDValue TFE;
  SDValue LWE;
  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
    return Op;

  if (IsTexFail) {
    if (!DMaskLanes) {
      // Expecting to get an error flag since TFC is on - and dmask is 0
      // Force dmask to be at least 1 otherwise the instruction will fail
      DMask = 0x1;
      DMaskLanes = 1;
      NumVDataDwords = 1;
    }
    // Reserve an extra result dword for the TFE/LWE status.
    NumVDataDwords += 1;
    AdjustRetType = true;
  }

  // Has something earlier tagged that the return type needs adjusting
  // This happens if the instruction is a load or has set TexFailCtrl flags
  if (AdjustRetType) {
    // NumVDataDwords reflects the true number of dwords required in the return type
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
      // This is a no-op load. This can be eliminated
      SDValue Undef = DAG.getUNDEF(Op.getValueType());
      if (isa<MemSDNode>(Op))
        return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
      return Undef;
    }

    EVT NewVT = NumVDataDwords > 1 ?
                    EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
                  : MVT::i32;

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {
      // Original result was aggregate type used for TexFailCtrl results
      // The actual instruction returns as a vector type which has now been
      // created. Remove the aggregate result.
      ResultTypes.erase(&ResultTypes[1]);
    }
  }

  // Validate the cache-policy bits against what this generation supports.
  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
    return Op;

  // Assemble the machine-instruction operand list in encoding order.
  if (BaseOpcode->Store || BaseOpcode->Atomic)
    Ops.push_back(VData); // vdata
  if (UsePartialNSA) {
    append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
    Ops.push_back(VAddr);
  }
  else if (UseNSA)
    append_range(Ops, VAddrs);
  else
    Ops.push_back(VAddr);
  SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
  EVT RsrcVT = Rsrc.getValueType();
  if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
    return Op;
  Ops.push_back(Rsrc);
  if (BaseOpcode->Sampler) {
    SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
    if (Samp.getValueType() != MVT::v4i32)
      return Op;
    Ops.push_back(Samp);
  }
  Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
  if (IsGFX10Plus)
    Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(Unorm);
  Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
  Ops.push_back(IsA16 &&  // r128, a16 for gfx9
                ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
  if (IsGFX10Plus)
    Ops.push_back(IsA16 ? True : False);
  if (!Subtarget->hasGFX90AInsts()) {
    Ops.push_back(TFE); //tfe
  } else if (TFE->getAsZExtVal()) {
    report_fatal_error("TFE is not supported on this GPU");
  }
  if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
    Ops.push_back(LWE); // lwe
  if (!IsGFX10Plus)
    Ops.push_back(DimInfo->DA ? True : False);
  if (BaseOpcode->HasD16)
    Ops.push_back(IsD16 ? True : False);
  if (isa<MemSDNode>(Op))
    Ops.push_back(Op.getOperand(0)); // chain

  // Select the opcode for the subtarget's MIMG encoding.
  int NumVAddrDwords =
      UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
  int Opcode = -1;

  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1)
          "requested image instruction is not supported on this GPU");
    }
    if (Opcode == -1 &&
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return Op;

  MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
    MachineMemOperand *MemRef = MemOp->getMemOperand();
    DAG.setNodeMemRefs(NewNode, {MemRef});
  }

  if (BaseOpcode->AtomicX2) {
    // AtomicX2 returns only the first element of the machine-node result.
    DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
    return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
  }
  if (BaseOpcode->NoReturn)
    return SDValue(NewNode, 0);
  return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
                           Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
                           NumVDataDwords, IsAtomicPacked16Bit, DL);
}
8303
// Lower llvm.amdgcn.s.buffer.load. A uniform offset uses the scalar
// SBUFFER_LOAD path (with subword and vec3-widening special cases); a
// divergent offset falls back to MUBUF buffer loads, split into 16-byte
// pieces for v8/v16 results.
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                       SDValue Offset, SDValue CachePolicy,
                                       SelectionDAG &DAG) const {

  const DataLayout &DataLayout = DAG.getDataLayout();
  Align Alignment =

      VT.getStoreSize(), Alignment);

  if (!Offset->isDivergent()) {
    SDValue Ops[] = {Rsrc, Offset, CachePolicy};

    // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
    // s_buffer_load_u16 instruction is emitted for both signed and unsigned
    // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
    // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
      SDValue BufferLoad =
          DAG.getVTList(MVT::i32), Ops, VT, MMO);
      return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
    }

    // Widen vec3 load to vec4.
    if (VT.isVector() && VT.getVectorNumElements() == 3 &&
        !Subtarget->hasScalarDwordx3Loads()) {
      EVT WidenedVT =
      auto WidenedOp = DAG.getMemIntrinsicNode(
          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
      auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
                                   DAG.getVectorIdxConstant(0, DL));
      return Subvector;
    }

        DAG.getVTList(VT), Ops, VT, MMO);
  }

  // We have a divergent offset. Emit a MUBUF buffer load instead. We can
  // assume that the buffer is unswizzled.
  SDValue Ops[] = {
      DAG.getEntryNode(),                    // Chain
      Rsrc,                                  // rsrc
      DAG.getConstant(0, DL, MVT::i32),      // vindex
      {},                                    // voffset
      {},                                    // soffset
      {},                                    // offset
      CachePolicy,                           // cachepolicy
      DAG.getTargetConstant(0, DL, MVT::i1), // idxen
  };
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  }

  unsigned NumLoads = 1;
  MVT LoadVT = VT.getSimpleVT();
  unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
  assert((LoadVT.getScalarType() == MVT::i32 ||
          LoadVT.getScalarType() == MVT::f32));

  // Wide results are fetched as multiple dwordx4 loads and concatenated.
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
    LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
  }

  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});

  // Use the alignment to ensure that the required offsets will fit into the
  // immediate offsets.
  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  // Each piece re-uses the same operand array with a bumped immediate offset.
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
                                        LoadVT, MMO, DAG));
  }

  if (NumElts == 8 || NumElts == 16)
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);

  return Loads[0];
}
8398
8399SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8400 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8401 if (!Subtarget->hasArchitectedSGPRs())
8402 return {};
8403 SDLoc SL(Op);
8404 MVT VT = MVT::i32;
8405 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8406 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8407 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8408}
8409
// Lower a workitem-id query for dimension \p Dim. Returns constant 0 when
// the subtarget/function guarantees the dimension is degenerate (MaxID == 0);
// otherwise loads the preloaded VGPR input described by \p Arg. Packed IDs
// are returned as-is (callers emit the masking); unpacked IDs get an
// AssertZext recording the known-zero high bits.
SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
                                          unsigned Dim,
                                          const ArgDescriptor &Arg) const {
  SDLoc SL(Op);
  unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
  if (MaxID == 0)
    return DAG.getConstant(0, SL, MVT::i32);

  SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
                               SDLoc(DAG.getEntryNode()), Arg);

  // Don't bother inserting AssertZext for packed IDs since we're emitting the
  // masking operations anyway.
  //
  // TODO: We could assert the top bit is 0 for the source copy.
  if (Arg.isMasked())
    return Val;

  // Preserve the known bits after expansion to a copy.
  return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
                     DAG.getValueType(SmallVT));
}
8434
8435SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8436 SelectionDAG &DAG) const {
8438 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8439
8440 EVT VT = Op.getValueType();
8441 SDLoc DL(Op);
8442 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8443
8444 // TODO: Should this propagate fast-math-flags?
8445
8446 switch (IntrinsicID) {
8447 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8448 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8449 return emitNonHSAIntrinsicError(DAG, DL, VT);
8450 return getPreloadedValue(DAG, *MFI, VT,
8452 }
8453 case Intrinsic::amdgcn_dispatch_ptr:
8454 case Intrinsic::amdgcn_queue_ptr: {
8455 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8456 DiagnosticInfoUnsupported BadIntrin(
8457 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8458 DL.getDebugLoc());
8459 DAG.getContext()->diagnose(BadIntrin);
8460 return DAG.getUNDEF(VT);
8461 }
8462
8463 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
8465 return getPreloadedValue(DAG, *MFI, VT, RegID);
8466 }
8467 case Intrinsic::amdgcn_implicitarg_ptr: {
8468 if (MFI->isEntryFunction())
8469 return getImplicitArgPtr(DAG, DL);
8470 return getPreloadedValue(DAG, *MFI, VT,
8472 }
8473 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8475 // This only makes sense to call in a kernel, so just lower to null.
8476 return DAG.getConstant(0, DL, VT);
8477 }
8478
8479 return getPreloadedValue(DAG, *MFI, VT,
8481 }
8482 case Intrinsic::amdgcn_dispatch_id: {
8483 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8484 }
8485 case Intrinsic::amdgcn_rcp:
8486 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8487 case Intrinsic::amdgcn_rsq:
8488 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8489 case Intrinsic::amdgcn_rsq_legacy:
8491 return emitRemovedIntrinsicError(DAG, DL, VT);
8492 return SDValue();
8493 case Intrinsic::amdgcn_rcp_legacy:
8495 return emitRemovedIntrinsicError(DAG, DL, VT);
8496 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8497 case Intrinsic::amdgcn_rsq_clamp: {
8499 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8500
8501 Type *Type = VT.getTypeForEVT(*DAG.getContext());
8504
8505 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8506 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8507 DAG.getConstantFP(Max, DL, VT));
8508 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8509 DAG.getConstantFP(Min, DL, VT));
8510 }
8511 case Intrinsic::r600_read_ngroups_x:
8512 if (Subtarget->isAmdHsaOS())
8513 return emitNonHSAIntrinsicError(DAG, DL, VT);
8514
8515 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8517 false);
8518 case Intrinsic::r600_read_ngroups_y:
8519 if (Subtarget->isAmdHsaOS())
8520 return emitNonHSAIntrinsicError(DAG, DL, VT);
8521
8522 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8524 false);
8525 case Intrinsic::r600_read_ngroups_z:
8526 if (Subtarget->isAmdHsaOS())
8527 return emitNonHSAIntrinsicError(DAG, DL, VT);
8528
8529 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8531 false);
8532 case Intrinsic::r600_read_global_size_x:
8533 if (Subtarget->isAmdHsaOS())
8534 return emitNonHSAIntrinsicError(DAG, DL, VT);
8535
8536 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8538 Align(4), false);
8539 case Intrinsic::r600_read_global_size_y:
8540 if (Subtarget->isAmdHsaOS())
8541 return emitNonHSAIntrinsicError(DAG, DL, VT);
8542
8543 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8545 Align(4), false);
8546 case Intrinsic::r600_read_global_size_z:
8547 if (Subtarget->isAmdHsaOS())
8548 return emitNonHSAIntrinsicError(DAG, DL, VT);
8549
8550 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
8552 Align(4), false);
8553 case Intrinsic::r600_read_local_size_x:
8554 if (Subtarget->isAmdHsaOS())
8555 return emitNonHSAIntrinsicError(DAG, DL, VT);
8556
8557 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8559 case Intrinsic::r600_read_local_size_y:
8560 if (Subtarget->isAmdHsaOS())
8561 return emitNonHSAIntrinsicError(DAG, DL, VT);
8562
8563 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8565 case Intrinsic::r600_read_local_size_z:
8566 if (Subtarget->isAmdHsaOS())
8567 return emitNonHSAIntrinsicError(DAG, DL, VT);
8568
8569 return lowerImplicitZextParam(DAG, Op, MVT::i16,
8571 case Intrinsic::amdgcn_workgroup_id_x:
8572 return getPreloadedValue(DAG, *MFI, VT,
8574 case Intrinsic::amdgcn_workgroup_id_y:
8575 return getPreloadedValue(DAG, *MFI, VT,
8577 case Intrinsic::amdgcn_workgroup_id_z:
8578 return getPreloadedValue(DAG, *MFI, VT,
8580 case Intrinsic::amdgcn_wave_id:
8581 return lowerWaveID(DAG, Op);
8582 case Intrinsic::amdgcn_lds_kernel_id: {
8583 if (MFI->isEntryFunction())
8584 return getLDSKernelId(DAG, DL);
8585 return getPreloadedValue(DAG, *MFI, VT,
8587 }
8588 case Intrinsic::amdgcn_workitem_id_x:
8589 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8590 case Intrinsic::amdgcn_workitem_id_y:
8591 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8592 case Intrinsic::amdgcn_workitem_id_z:
8593 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8594 case Intrinsic::amdgcn_wavefrontsize:
8596 SDLoc(Op), MVT::i32);
8597 case Intrinsic::amdgcn_s_buffer_load: {
8598 unsigned CPol = Op.getConstantOperandVal(3);
8599 // s_buffer_load, because of how it's optimized, can't be volatile
8600 // so reject ones with the volatile bit set.
8601 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
8604 return Op;
8605 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8606 DAG);
8607 }
8608 case Intrinsic::amdgcn_fdiv_fast:
8609 return lowerFDIV_FAST(Op, DAG);
8610 case Intrinsic::amdgcn_sin:
8611 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8612
8613 case Intrinsic::amdgcn_cos:
8614 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8615
8616 case Intrinsic::amdgcn_mul_u24:
8617 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8618 case Intrinsic::amdgcn_mul_i24:
8619 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8620
8621 case Intrinsic::amdgcn_log_clamp: {
8623 return SDValue();
8624
8625 return emitRemovedIntrinsicError(DAG, DL, VT);
8626 }
8627 case Intrinsic::amdgcn_fract:
8628 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8629
8630 case Intrinsic::amdgcn_class:
8631 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8632 Op.getOperand(1), Op.getOperand(2));
8633 case Intrinsic::amdgcn_div_fmas:
8634 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8635 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8636 Op.getOperand(4));
8637
8638 case Intrinsic::amdgcn_div_fixup:
8639 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8640 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8641
8642 case Intrinsic::amdgcn_div_scale: {
8643 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8644
8645 // Translate to the operands expected by the machine instruction. The
8646 // first parameter must be the same as the first instruction.
8647 SDValue Numerator = Op.getOperand(1);
8648 SDValue Denominator = Op.getOperand(2);
8649
8650 // Note this order is opposite of the machine instruction's operations,
8651 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8652 // intrinsic has the numerator as the first operand to match a normal
8653 // division operation.
8654
8655 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8656
8657 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8658 Denominator, Numerator);
8659 }
8660 case Intrinsic::amdgcn_icmp: {
8661 // There is a Pat that handles this variant, so return it as-is.
8662 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8663 Op.getConstantOperandVal(2) == 0 &&
8664 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8665 return Op;
8666 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8667 }
8668 case Intrinsic::amdgcn_fcmp: {
8669 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8670 }
8671 case Intrinsic::amdgcn_ballot:
8672 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8673 case Intrinsic::amdgcn_fmed3:
8674 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8675 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8676 case Intrinsic::amdgcn_fdot2:
8677 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8678 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8679 Op.getOperand(4));
8680 case Intrinsic::amdgcn_fmul_legacy:
8681 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8682 Op.getOperand(1), Op.getOperand(2));
8683 case Intrinsic::amdgcn_sffbh:
8684 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8685 case Intrinsic::amdgcn_sbfe:
8686 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8687 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8688 case Intrinsic::amdgcn_ubfe:
8689 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8690 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8691 case Intrinsic::amdgcn_cvt_pkrtz:
8692 case Intrinsic::amdgcn_cvt_pknorm_i16:
8693 case Intrinsic::amdgcn_cvt_pknorm_u16:
8694 case Intrinsic::amdgcn_cvt_pk_i16:
8695 case Intrinsic::amdgcn_cvt_pk_u16: {
8696 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8697 EVT VT = Op.getValueType();
8698 unsigned Opcode;
8699
8700 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8702 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8704 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8706 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8708 else
8710
8711 if (isTypeLegal(VT))
8712 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8713
8714 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8715 Op.getOperand(1), Op.getOperand(2));
8716 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8717 }
8718 case Intrinsic::amdgcn_fmad_ftz:
8719 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8720 Op.getOperand(2), Op.getOperand(3));
8721
8722 case Intrinsic::amdgcn_if_break:
8723 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8724 Op->getOperand(1), Op->getOperand(2)), 0);
8725
8726 case Intrinsic::amdgcn_groupstaticsize: {
8728 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8729 return Op;
8730
8731 const Module *M = MF.getFunction().getParent();
8732 const GlobalValue *GV =
8733 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8734 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
8736 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8737 }
8738 case Intrinsic::amdgcn_is_shared:
8739 case Intrinsic::amdgcn_is_private: {
8740 SDLoc SL(Op);
8741 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
8743 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8744 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8745 Op.getOperand(1));
8746
8747 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8748 DAG.getConstant(1, SL, MVT::i32));
8749 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8750 }
8751 case Intrinsic::amdgcn_perm:
8752 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8753 Op.getOperand(2), Op.getOperand(3));
8754 case Intrinsic::amdgcn_reloc_constant: {
8755 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8756 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8757 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8758 auto RelocSymbol = cast<GlobalVariable>(
8759 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8760 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
8762 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8763 }
8764 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8765 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8766 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8767 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8768 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8769 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8770 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8771 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8772 if (Op.getOperand(4).getValueType() == MVT::i32)
8773 return SDValue();
8774
8775 SDLoc SL(Op);
8776 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8777 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8778 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8779 Op.getOperand(3), IndexKeyi32);
8780 }
8781 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8782 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8783 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8784 if (Op.getOperand(6).getValueType() == MVT::i32)
8785 return SDValue();
8786
8787 SDLoc SL(Op);
8788 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8789 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8790 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8791 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8792 IndexKeyi32, Op.getOperand(7)});
8793 }
8794 case Intrinsic::amdgcn_addrspacecast_nonnull:
8795 return lowerADDRSPACECAST(Op, DAG);
8796 case Intrinsic::amdgcn_readlane:
8797 case Intrinsic::amdgcn_readfirstlane:
8798 case Intrinsic::amdgcn_writelane:
8799 case Intrinsic::amdgcn_permlane16:
8800 case Intrinsic::amdgcn_permlanex16:
8801 case Intrinsic::amdgcn_permlane64:
8802 case Intrinsic::amdgcn_set_inactive:
8803 case Intrinsic::amdgcn_set_inactive_chain_arg:
8804 return lowerLaneOp(*this, Op.getNode(), DAG);
8805 default:
8806 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8808 return lowerImage(Op, ImageDimIntr, DAG, false);
8809
8810 return Op;
8811 }
8812}
8813
8814 // On targets not supporting constant in soffset field, turn zero to
8815 // SGPR_NULL to avoid generating an extra s_mov with zero.
// NOTE(review): this doxygen-rendered listing dropped the hyperlinked line
// 8816 that carried the start of the signature (presumably
// `static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,`) —
// confirm against the original SIISelLowering.cpp before editing.
// Returns the SOffset operand unchanged unless the subtarget restricts the
// soffset field, in which case a zero soffset is canonicalized to SGPR_NULL.
8817 const GCNSubtarget *Subtarget) {
8818 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8819 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8820 return SOffset;
8821}
8822
// Lower a raw-buffer atomic intrinsic (e.g. amdgcn_raw_buffer_atomic_add)
// into the target memory-intrinsic node NewOpcode. Raw buffer ops have no
// vindex, so a constant-0 vindex and idxen=0 are synthesized; the combined
// offset operand (operand 4) is split into voffset + immediate offset, and
// soffset (operand 5) is canonicalized via selectSOffset.
8823 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8824 SelectionDAG &DAG,
8825 unsigned NewOpcode) const {
8826 SDLoc DL(Op);
8827
// Operand layout of the incoming intrinsic: 0=chain, 2=vdata, 3=rsrc ptr,
// 4=offset, 5=soffset, 6=cachepolicy (operand 1 is the intrinsic ID).
8828 SDValue VData = Op.getOperand(2);
8829 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8830 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8831 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8832 SDValue Ops[] = {
8833 Op.getOperand(0), // Chain
8834 VData, // vdata
8835 Rsrc, // rsrc
8836 DAG.getConstant(0, DL, MVT::i32), // vindex
8837 Offsets.first, // voffset
8838 SOffset, // soffset
8839 Offsets.second, // offset
8840 Op.getOperand(6), // cachepolicy
8841 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8842 };
8843
8844 auto *M = cast<MemSDNode>(Op);
8845
// Memory VT is taken from the atomic's data operand; the memory operand is
// carried over from the original intrinsic node.
8846 EVT MemVT = VData.getValueType();
8847 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8848 M->getMemOperand());
8849}
8850
// Lower a struct-buffer atomic intrinsic into the target memory-intrinsic
// node NewOpcode. Unlike the raw-buffer form, struct buffer ops carry a real
// vindex (operand 4) and set idxen=1; offset/soffset are therefore shifted to
// operands 5/6 and cachepolicy to operand 7.
8851 SDValue
8852 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8853 unsigned NewOpcode) const {
8854 SDLoc DL(Op);
8855
// Operand layout: 0=chain, 2=vdata, 3=rsrc ptr, 4=vindex, 5=offset,
// 6=soffset, 7=cachepolicy.
8856 SDValue VData = Op.getOperand(2);
8857 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8858 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8859 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8860 SDValue Ops[] = {
8861 Op.getOperand(0), // Chain
8862 VData, // vdata
8863 Rsrc, // rsrc
8864 Op.getOperand(4), // vindex
8865 Offsets.first, // voffset
8866 SOffset, // soffset
8867 Offsets.second, // offset
8868 Op.getOperand(7), // cachepolicy
8869 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8870 };
8871
8872 auto *M = cast<MemSDNode>(Op);
8873
// As in the raw-buffer path: memory VT comes from vdata, MMO from the
// original node.
8874 EVT MemVT = VData.getValueType();
8875 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8876 M->getMemOperand());
8877}
8878
// Custom lowering for chained intrinsics (ISD::INTRINSIC_W_CHAIN). Dispatches
// on the intrinsic ID (operand 1) and builds the corresponding AMDGPU DAG or
// machine node. Unhandled IDs fall through to image-dimension intrinsic
// lowering, else return an empty SDValue to use default lowering.
//
// NOTE(review): this doxygen-rendered listing drops lines that were rendered
// as hyperlinks (e.g. 8887, 8935, 8994, 9021, 9102, 9168, 9191, 9195, 9244,
// 9245, 9334, 9364), so several statements below appear truncated — compare
// against the original SIISelLowering.cpp before modifying anything here.
8879 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8880 SelectionDAG &DAG) const {
8881 unsigned IntrID = Op.getConstantOperandVal(1);
8882 SDLoc DL(Op);
8883
8884 switch (IntrID) {
// ds_ordered_add/swap: validate and pack the index/wave flags into the
// 16-bit offset immediate expected by the DS_ORDERED_COUNT instruction.
8885 case Intrinsic::amdgcn_ds_ordered_add:
8886 case Intrinsic::amdgcn_ds_ordered_swap: {
8888 SDValue Chain = M->getOperand(0);
8889 SDValue M0 = M->getOperand(2);
8890 SDValue Value = M->getOperand(3);
8891 unsigned IndexOperand = M->getConstantOperandVal(7);
8892 unsigned WaveRelease = M->getConstantOperandVal(8);
8893 unsigned WaveDone = M->getConstantOperandVal(9);
8894
// Low 6 bits of the index operand select the ordered-count index; on GFX10+
// bits [27:24] encode the dword count (1-4). Any other set bit is invalid.
8895 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8896 IndexOperand &= ~0x3f;
8897 unsigned CountDw = 0;
8898
8899 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8900 CountDw = (IndexOperand >> 24) & 0xf;
8901 IndexOperand &= ~(0xf << 24);
8902
8903 if (CountDw < 1 || CountDw > 4) {
8905 "ds_ordered_count: dword count must be between 1 and 4");
8906 }
8907 }
8908
8909 if (IndexOperand)
8910 report_fatal_error("ds_ordered_count: bad index operand");
8911
8912 if (WaveDone && !WaveRelease)
8913 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8914
// Pack the fields into the instruction's two offset bytes:
// Offset0 = index*4; Offset1 = wave_release | wave_done<<1 | instr<<4,
// plus (count-1)<<6 on GFX10+ and shader-type<<2 before GFX11.
8915 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8916 unsigned ShaderType =
8918 unsigned Offset0 = OrderedCountIndex << 2;
8919 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8920
8921 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8922 Offset1 |= (CountDw - 1) << 6;
8923
8924 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8925 Offset1 |= ShaderType << 2;
8926
8927 unsigned Offset = Offset0 | (Offset1 << 8);
8928
8929 SDValue Ops[] = {
8930 Chain,
8931 Value,
8932 DAG.getTargetConstant(Offset, DL, MVT::i16),
8933 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8934 };
8936 M->getVTList(), Ops, M->getMemoryVT(),
8937 M->getMemOperand());
8938 }
// Raw buffer loads (plain, atomic, and format variants): synthesize vindex=0
// and idxen=0, split the offset, and hand off to lowerIntrinsicLoad.
8939 case Intrinsic::amdgcn_raw_buffer_load:
8940 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8941 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8942 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8943 case Intrinsic::amdgcn_raw_buffer_load_format:
8944 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8945 const bool IsFormat =
8946 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8947 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8948
8949 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8950 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8951 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8952 SDValue Ops[] = {
8953 Op.getOperand(0), // Chain
8954 Rsrc, // rsrc
8955 DAG.getConstant(0, DL, MVT::i32), // vindex
8956 Offsets.first, // voffset
8957 SOffset, // soffset
8958 Offsets.second, // offset
8959 Op.getOperand(5), // cachepolicy, swizzled buffer
8960 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8961 };
8962
8963 auto *M = cast<MemSDNode>(Op);
8964 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8965 }
// Struct buffer loads: same as above but with a real vindex and idxen=1.
8966 case Intrinsic::amdgcn_struct_buffer_load:
8967 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8968 case Intrinsic::amdgcn_struct_buffer_load_format:
8969 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8970 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8971 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
8972 const bool IsFormat =
8973 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8974 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8975
8976 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8977 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8978 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8979 SDValue Ops[] = {
8980 Op.getOperand(0), // Chain
8981 Rsrc, // rsrc
8982 Op.getOperand(3), // vindex
8983 Offsets.first, // voffset
8984 SOffset, // soffset
8985 Offsets.second, // offset
8986 Op.getOperand(6), // cachepolicy, swizzled buffer
8987 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8988 };
8989
8990 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8991 }
// Raw tbuffer loads: like raw buffer loads plus a format operand; f16
// results go through the D16 load-value adjustment.
8992 case Intrinsic::amdgcn_raw_tbuffer_load:
8993 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8995 EVT LoadVT = Op.getValueType();
8996 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8997 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8998 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8999
9000 SDValue Ops[] = {
9001 Op.getOperand(0), // Chain
9002 Rsrc, // rsrc
9003 DAG.getConstant(0, DL, MVT::i32), // vindex
9004 Offsets.first, // voffset
9005 SOffset, // soffset
9006 Offsets.second, // offset
9007 Op.getOperand(5), // format
9008 Op.getOperand(6), // cachepolicy, swizzled buffer
9009 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9010 };
9011
9012 if (LoadVT.getScalarType() == MVT::f16)
9013 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
9014 M, DAG, Ops);
9015 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9016 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9017 DAG);
9018 }
// Struct tbuffer loads: struct-buffer operand layout plus a format operand.
9019 case Intrinsic::amdgcn_struct_tbuffer_load:
9020 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
9022 EVT LoadVT = Op.getValueType();
9023 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9024 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9025 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9026
9027 SDValue Ops[] = {
9028 Op.getOperand(0), // Chain
9029 Rsrc, // rsrc
9030 Op.getOperand(3), // vindex
9031 Offsets.first, // voffset
9032 SOffset, // soffset
9033 Offsets.second, // offset
9034 Op.getOperand(6), // format
9035 Op.getOperand(7), // cachepolicy, swizzled buffer
9036 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9037 };
9038
9039 if (LoadVT.getScalarType() == MVT::f16)
9040 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
9041 M, DAG, Ops);
9042 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
9043 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
9044 DAG);
9045 }
// Buffer atomics: each raw/struct (and ptr) intrinsic pair maps 1:1 onto a
// BUFFER_ATOMIC_* node via the two helpers defined above.
9046 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
9047 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
9048 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9049 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
9050 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
9051 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
9052 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
9053 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
9054 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9055 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
9056 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
9057 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
9058 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
9059 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
9060 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9061 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
9062 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9063 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9064 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9065 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9066 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9067 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9068 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9069 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9070 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9072 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9073 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9074 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9075 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9076 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9077 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9078 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9079 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9080 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9081 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9082 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9083 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9084 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9085 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9086 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9087 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9088 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9089 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9090 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9091 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9092 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9093 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9094 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9095 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9096 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9097 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9098 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9099 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9100 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9101 return lowerRawBufferAtomicIntrin(Op, DAG,
9103 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9104 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9105 return lowerStructBufferAtomicIntrin(Op, DAG,
9107 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9108 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9109 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9110 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9111 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9112 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9113 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9114 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9115 return lowerStructBufferAtomicIntrin(Op, DAG,
9117 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9118 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9119 return lowerStructBufferAtomicIntrin(Op, DAG,
9121 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9122 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9123 return lowerStructBufferAtomicIntrin(Op, DAG,
9125 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9126 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9127 return lowerStructBufferAtomicIntrin(Op, DAG,
9129 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9130 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9131 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9132 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9133 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9134 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9135 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9136 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9137 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9138 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9139 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9140 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9141 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9142 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9143 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9144 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9145 return lowerStructBufferAtomicIntrin(Op, DAG,
9147
// Buffer compare-and-swap: built inline (not via the helpers) because it
// carries both src and cmp data operands.
9148 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9149 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9150 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9151 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9152 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9153 SDValue Ops[] = {
9154 Op.getOperand(0), // Chain
9155 Op.getOperand(2), // src
9156 Op.getOperand(3), // cmp
9157 Rsrc, // rsrc
9158 DAG.getConstant(0, DL, MVT::i32), // vindex
9159 Offsets.first, // voffset
9160 SOffset, // soffset
9161 Offsets.second, // offset
9162 Op.getOperand(7), // cachepolicy
9163 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9164 };
9165 EVT VT = Op.getValueType();
9166 auto *M = cast<MemSDNode>(Op);
9167
9169 Op->getVTList(), Ops, VT, M->getMemOperand());
9170 }
9171 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9172 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9173 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9174 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9175 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9176 SDValue Ops[] = {
9177 Op.getOperand(0), // Chain
9178 Op.getOperand(2), // src
9179 Op.getOperand(3), // cmp
9180 Rsrc, // rsrc
9181 Op.getOperand(5), // vindex
9182 Offsets.first, // voffset
9183 SOffset, // soffset
9184 Offsets.second, // offset
9185 Op.getOperand(8), // cachepolicy
9186 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9187 };
9188 EVT VT = Op.getValueType();
9189 auto *M = cast<MemSDNode>(Op);
9190
9192 Op->getVTList(), Ops, VT, M->getMemOperand());
9193 }
// image_bvh_intersect_ray: select the MIMG opcode variant (GFX10/11/12,
// NSA vs. merged-vector addressing, a16 vs. full-precision ray data) and
// pack the ray operands accordingly.
9194 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9196 SDValue NodePtr = M->getOperand(2);
9197 SDValue RayExtent = M->getOperand(3);
9198 SDValue RayOrigin = M->getOperand(4);
9199 SDValue RayDir = M->getOperand(5);
9200 SDValue RayInvDir = M->getOperand(6);
9201 SDValue TDescr = M->getOperand(7);
9202
9203 assert(NodePtr.getValueType() == MVT::i32 ||
9204 NodePtr.getValueType() == MVT::i64);
9205 assert(RayDir.getValueType() == MVT::v3f16 ||
9206 RayDir.getValueType() == MVT::v3f32);
9207
9208 if (!Subtarget->hasGFX10_AEncoding()) {
9209 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9210 return SDValue();
9211 }
9212
9213 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9214 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9215 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9216 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9217 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9218 const unsigned NumVDataDwords = 4;
9219 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9220 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9221 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9222 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9223 IsGFX12Plus;
9224 const unsigned BaseOpcodes[2][2] = {
9225 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9226 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9227 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9228 int Opcode;
9229 if (UseNSA) {
9230 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9231 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9232 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9233 : AMDGPU::MIMGEncGfx10NSA,
9234 NumVDataDwords, NumVAddrDwords);
9235 } else {
9236 assert(!IsGFX12Plus);
9237 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9238 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9239 : AMDGPU::MIMGEncGfx10Default,
9240 NumVDataDwords, NumVAddrDwords);
9241 }
9242 assert(Opcode != -1);
9243
9245
// packLanes: append a 3-component vector as i32 dwords. 32-bit lanes pass
// through; f16 lanes are packed pairwise into v2f16->i32, with the
// unaligned case re-packing across the previously pushed dword.
9246 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9248 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9249 if (Lanes[0].getValueSizeInBits() == 32) {
9250 for (unsigned I = 0; I < 3; ++I)
9251 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9252 } else {
9253 if (IsAligned) {
9254 Ops.push_back(
9255 DAG.getBitcast(MVT::i32,
9256 DAG.getBuildVector(MVT::v2f16, DL,
9257 { Lanes[0], Lanes[1] })));
9258 Ops.push_back(Lanes[2]);
9259 } else {
9260 SDValue Elt0 = Ops.pop_back_val();
9261 Ops.push_back(
9262 DAG.getBitcast(MVT::i32,
9263 DAG.getBuildVector(MVT::v2f16, DL,
9264 { Elt0, Lanes[0] })));
9265 Ops.push_back(
9266 DAG.getBitcast(MVT::i32,
9267 DAG.getBuildVector(MVT::v2f16, DL,
9268 { Lanes[1], Lanes[2] })));
9269 }
9270 }
9271 };
9272
// GFX11+ NSA expects whole vector operands (dir/inv_dir interleaved when
// a16); older targets take the flattened dword list built via packLanes.
9273 if (UseNSA && IsGFX11Plus) {
9274 Ops.push_back(NodePtr);
9275 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9276 Ops.push_back(RayOrigin);
9277 if (IsA16) {
9278 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9279 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9280 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9281 for (unsigned I = 0; I < 3; ++I) {
9282 MergedLanes.push_back(DAG.getBitcast(
9283 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9284 {DirLanes[I], InvDirLanes[I]})));
9285 }
9286 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9287 } else {
9288 Ops.push_back(RayDir);
9289 Ops.push_back(RayInvDir);
9290 }
9291 } else {
9292 if (Is64)
9293 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9294 2);
9295 else
9296 Ops.push_back(NodePtr);
9297
9298 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9299 packLanes(RayOrigin, true);
9300 packLanes(RayDir, true);
9301 packLanes(RayInvDir, false);
9302 }
9303
9304 if (!UseNSA) {
9305 // Build a single vector containing all the operands so far prepared.
9306 if (NumVAddrDwords > 12) {
9307 SDValue Undef = DAG.getUNDEF(MVT::i32);
9308 Ops.append(16 - Ops.size(), Undef);
9309 }
9310 assert(Ops.size() >= 8 && Ops.size() <= 12);
9311 SDValue MergedOps = DAG.getBuildVector(
9312 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9313 Ops.clear();
9314 Ops.push_back(MergedOps);
9315 }
9316
9317 Ops.push_back(TDescr);
9318 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9319 Ops.push_back(M->getChain());
9320
9321 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9322 MachineMemOperand *MemRef = M->getMemOperand();
9323 DAG.setNodeMemRefs(NewNode, {MemRef});
9324 return SDValue(NewNode, 0);
9325 }
// Global/flat FP min/max atomics: lower to the generic ISD atomic nodes,
// keeping the original memory operand.
9326 case Intrinsic::amdgcn_global_atomic_fmin:
9327 case Intrinsic::amdgcn_global_atomic_fmax:
9328 case Intrinsic::amdgcn_global_atomic_fmin_num:
9329 case Intrinsic::amdgcn_global_atomic_fmax_num:
9330 case Intrinsic::amdgcn_flat_atomic_fmin:
9331 case Intrinsic::amdgcn_flat_atomic_fmax:
9332 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9333 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9335 SDValue Ops[] = {
9336 M->getOperand(0), // Chain
9337 M->getOperand(2), // Ptr
9338 M->getOperand(3) // Value
9339 };
9340 unsigned Opcode = 0;
9341 switch (IntrID) {
9342 case Intrinsic::amdgcn_global_atomic_fmin:
9343 case Intrinsic::amdgcn_global_atomic_fmin_num:
9344 case Intrinsic::amdgcn_flat_atomic_fmin:
9345 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9346 Opcode = ISD::ATOMIC_LOAD_FMIN;
9347 break;
9348 }
9349 case Intrinsic::amdgcn_global_atomic_fmax:
9350 case Intrinsic::amdgcn_global_atomic_fmax_num:
9351 case Intrinsic::amdgcn_flat_atomic_fmax:
9352 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9353 Opcode = ISD::ATOMIC_LOAD_FMAX;
9354 break;
9355 }
9356 default:
9357 llvm_unreachable("unhandled atomic opcode");
9358 }
9359 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
9360 Ops, M->getMemOperand());
9361 }
// s_get_barrier_state: use the immediate-form instruction when the barrier
// ID is an inlinable constant, otherwise pass the ID through M0.
9362 case Intrinsic::amdgcn_s_get_barrier_state: {
9363 SDValue Chain = Op->getOperand(0);
9365 unsigned Opc;
9366 bool IsInlinableBarID = false;
9367 int64_t BarID;
9368
9369 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9370 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9371 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9372 }
9373
9374 if (IsInlinableBarID) {
9375 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9376 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9377 Ops.push_back(K);
9378 Ops.push_back(Chain);
9379 } else {
9380 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9381 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9382 Ops.push_back(M0Val.getValue(0));
9383 }
9384
9385 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9386 return SDValue(NewMI, 0);
9387 }
// Fallback: image-dimension intrinsics are table-driven; everything else
// returns an empty SDValue to get default expansion.
9388 default:
9389
9390 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9392 return lowerImage(Op, ImageDimIntr, DAG, true);
9393
9394 return SDValue();
9395 }
9396}
9397
9398// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9399// dwordx4 if on SI and handle TFE loads.
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
                                              SDVTList VTList,
                                              ArrayRef<SDValue> Ops, EVT MemVT,
                                              MachineMemOperand *MMO,
                                              SelectionDAG &DAG) const {
  LLVMContext &C = *DAG.getContext();
  // NOTE(review): a line was dropped from this extract here; `MF` is used
  // below with no visible declaration (presumably
  // `MachineFunction &MF = DAG.getMachineFunction();`) -- confirm upstream.
  EVT VT = VTList.VTs[0];

  // VTList is {value, chain} or, for TFE loads, {value, status, chain}.
  assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
  bool IsTFE = VTList.NumVTs == 3;
  if (IsTFE) {
    // Widen the load by one i32 so the value dwords and the trailing TFE
    // status dword are fetched as one vector, then split them back apart.
    unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
    unsigned NumOpDWords = NumValueDWords + 1;
    EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
        MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
    // NOTE(review): the first line of the `Status` extraction (an
    // ISD::EXTRACT_VECTOR_ELT of the final dword) was dropped from this
    // extract; only its continuation survives below.
                             DAG.getVectorIdxConstant(NumValueDWords, DL));
    SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
    SDValue ValueDWords =
        NumValueDWords == 1
            ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
    // NOTE(review): the first line of the alternative arm (an extract of the
    // leading value subvector) was dropped from this extract.
                          EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
                          ZeroIdx);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  // Targets without dwordx3 loads/stores: widen a v3 access to v4 and pull
  // the original three elements back out of the result.
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
    EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
    MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
    SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
                                         WidenedMemVT, WidenedMMO);
    // NOTE(review): the first line of the `Value` extraction was dropped from
    // this extract; only its continuation survives below.
                             DAG.getVectorIdxConstant(0, DL));
    return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
  }

  // No widening required: emit the memory intrinsic as requested.
  return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
}
9448
// Repack 16-bit-component (D16) store data into the layout the subtarget's
// buffer/image store instructions expect; returns VData unchanged when no
// repacking is needed.
SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
                                         bool ImageStore) const {
  EVT StoreVT = VData.getValueType();

  // No change for f16 and legal vector D16 types.
  if (!StoreVT.isVector())
    return VData;

  SDLoc DL(VData);
  unsigned NumElements = StoreVT.getVectorNumElements();

  if (Subtarget->hasUnpackedD16VMem()) {
    // We need to unpack the packed data to store: zero-extend each 16-bit
    // integer element to its own i32 lane, then unroll.
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT EquivStoreVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
    return DAG.UnrollVectorOp(ZExt.getNode());
  }

  // The sq block of gfx8.1 does not estimate register use correctly for d16
  // image store instructions. The data operand is computed as if it were not a
  // d16 image instruction.
  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
    // Bitcast to i16
    EVT IntStoreVT = StoreVT.changeTypeToInteger();
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    // Decompose into scalars
    // NOTE(review): the declaration of `Elts` (a SmallVector of SDValue) was
    // dropped from this extract -- confirm upstream.
    DAG.ExtractVectorElements(IntVData, Elts);

    // Group pairs of i16 into v2i16 and bitcast to i32
    SmallVector<SDValue, 4> PackedElts;
    for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
      SDValue Pair =
          DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(IntPair);
    }
    if ((NumElements % 2) == 1) {
      // Handle v3i16: pair the odd trailing element with an undef i16.
      unsigned I = Elts.size() / 2;
      SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
                                        {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
      SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
      PackedElts.push_back(IntPair);
    }

    // Pad using UNDEF up to one i32 per original 16-bit element.
    PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));

    // Build final vector
    EVT VecVT =
        EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
    return DAG.getBuildVector(VecVT, DL, PackedElts);
  }

  if (NumElements == 3) {
    // Widen an illegal 3-element D16 vector to 4 elements via an integer
    // zero-extend of the packed bits.
    EVT IntStoreVT =
    // NOTE(review): the continuation of this initializer (presumably
    // EVT::getIntegerVT of the store size) was dropped from this extract.
    SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

    EVT WidenedStoreVT = EVT::getVectorVT(
        *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
    EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
                                         WidenedStoreVT.getStoreSizeInBits());
    SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
    return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
  }

  // Anything else must already be a legal packed D16 type.
  assert(isTypeLegal(StoreVT));
  return VData;
}
9525
9526SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9527 SelectionDAG &DAG) const {
9528 SDLoc DL(Op);
9529 SDValue Chain = Op.getOperand(0);
9530 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9532
9533 switch (IntrinsicID) {
9534 case Intrinsic::amdgcn_exp_compr: {
9535 if (!Subtarget->hasCompressedExport()) {
9536 DiagnosticInfoUnsupported BadIntrin(
9538 "intrinsic not supported on subtarget", DL.getDebugLoc());
9539 DAG.getContext()->diagnose(BadIntrin);
9540 }
9541 SDValue Src0 = Op.getOperand(4);
9542 SDValue Src1 = Op.getOperand(5);
9543 // Hack around illegal type on SI by directly selecting it.
9544 if (isTypeLegal(Src0.getValueType()))
9545 return SDValue();
9546
9547 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9548 SDValue Undef = DAG.getUNDEF(MVT::f32);
9549 const SDValue Ops[] = {
9550 Op.getOperand(2), // tgt
9551 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9552 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9553 Undef, // src2
9554 Undef, // src3
9555 Op.getOperand(7), // vm
9556 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9557 Op.getOperand(3), // en
9558 Op.getOperand(0) // Chain
9559 };
9560
9561 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9562 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9563 }
9564 case Intrinsic::amdgcn_s_barrier: {
9566 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9567 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9568 if (WGSize <= ST.getWavefrontSize())
9569 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9570 Op.getOperand(0)), 0);
9571 }
9572
9573 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9574 if (ST.hasSplitBarriers()) {
9575 SDValue K =
9577 SDValue BarSignal =
9578 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9579 MVT::Other, K, Op.getOperand(0)),
9580 0);
9581 SDValue BarWait =
9582 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9583 BarSignal.getValue(0)),
9584 0);
9585 return BarWait;
9586 }
9587
9588 return SDValue();
9589 };
9590
9591 case Intrinsic::amdgcn_struct_tbuffer_store:
9592 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9593 SDValue VData = Op.getOperand(2);
9594 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9595 if (IsD16)
9596 VData = handleD16VData(VData, DAG);
9597 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9598 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9599 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9600 SDValue Ops[] = {
9601 Chain,
9602 VData, // vdata
9603 Rsrc, // rsrc
9604 Op.getOperand(4), // vindex
9605 Offsets.first, // voffset
9606 SOffset, // soffset
9607 Offsets.second, // offset
9608 Op.getOperand(7), // format
9609 Op.getOperand(8), // cachepolicy, swizzled buffer
9610 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9611 };
9612 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9615 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9616 M->getMemoryVT(), M->getMemOperand());
9617 }
9618
9619 case Intrinsic::amdgcn_raw_tbuffer_store:
9620 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9621 SDValue VData = Op.getOperand(2);
9622 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9623 if (IsD16)
9624 VData = handleD16VData(VData, DAG);
9625 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9626 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9627 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9628 SDValue Ops[] = {
9629 Chain,
9630 VData, // vdata
9631 Rsrc, // rsrc
9632 DAG.getConstant(0, DL, MVT::i32), // vindex
9633 Offsets.first, // voffset
9634 SOffset, // soffset
9635 Offsets.second, // offset
9636 Op.getOperand(6), // format
9637 Op.getOperand(7), // cachepolicy, swizzled buffer
9638 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9639 };
9640 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9643 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9644 M->getMemoryVT(), M->getMemOperand());
9645 }
9646
9647 case Intrinsic::amdgcn_raw_buffer_store:
9648 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9649 case Intrinsic::amdgcn_raw_buffer_store_format:
9650 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9651 const bool IsFormat =
9652 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9653 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9654
9655 SDValue VData = Op.getOperand(2);
9656 EVT VDataVT = VData.getValueType();
9657 EVT EltType = VDataVT.getScalarType();
9658 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9659 if (IsD16) {
9660 VData = handleD16VData(VData, DAG);
9661 VDataVT = VData.getValueType();
9662 }
9663
9664 if (!isTypeLegal(VDataVT)) {
9665 VData =
9666 DAG.getNode(ISD::BITCAST, DL,
9667 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9668 }
9669
9670 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9671 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9672 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9673 SDValue Ops[] = {
9674 Chain,
9675 VData,
9676 Rsrc,
9677 DAG.getConstant(0, DL, MVT::i32), // vindex
9678 Offsets.first, // voffset
9679 SOffset, // soffset
9680 Offsets.second, // offset
9681 Op.getOperand(6), // cachepolicy, swizzled buffer
9682 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9683 };
9684 unsigned Opc =
9686 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9688
9689 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9690 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9691 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9692
9693 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9694 M->getMemoryVT(), M->getMemOperand());
9695 }
9696
9697 case Intrinsic::amdgcn_struct_buffer_store:
9698 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9699 case Intrinsic::amdgcn_struct_buffer_store_format:
9700 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9701 const bool IsFormat =
9702 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9703 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9704
9705 SDValue VData = Op.getOperand(2);
9706 EVT VDataVT = VData.getValueType();
9707 EVT EltType = VDataVT.getScalarType();
9708 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9709
9710 if (IsD16) {
9711 VData = handleD16VData(VData, DAG);
9712 VDataVT = VData.getValueType();
9713 }
9714
9715 if (!isTypeLegal(VDataVT)) {
9716 VData =
9717 DAG.getNode(ISD::BITCAST, DL,
9718 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9719 }
9720
9721 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9722 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9723 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9724 SDValue Ops[] = {
9725 Chain,
9726 VData,
9727 Rsrc,
9728 Op.getOperand(4), // vindex
9729 Offsets.first, // voffset
9730 SOffset, // soffset
9731 Offsets.second, // offset
9732 Op.getOperand(7), // cachepolicy, swizzled buffer
9733 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9734 };
9735 unsigned Opc =
9737 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9739
9740 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9741 EVT VDataType = VData.getValueType().getScalarType();
9742 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9743 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9744
9745 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9746 M->getMemoryVT(), M->getMemOperand());
9747 }
9748 case Intrinsic::amdgcn_raw_buffer_load_lds:
9749 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9750 case Intrinsic::amdgcn_struct_buffer_load_lds:
9751 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9752 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9753 unsigned Opc;
9754 bool HasVIndex =
9755 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9756 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9757 unsigned OpOffset = HasVIndex ? 1 : 0;
9758 SDValue VOffset = Op.getOperand(5 + OpOffset);
9759 bool HasVOffset = !isNullConstant(VOffset);
9760 unsigned Size = Op->getConstantOperandVal(4);
9761
9762 switch (Size) {
9763 default:
9764 return SDValue();
9765 case 1:
9766 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9767 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9768 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9769 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9770 break;
9771 case 2:
9772 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9773 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9774 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9775 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9776 break;
9777 case 4:
9778 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9779 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9780 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9781 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9782 break;
9783 }
9784
9785 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9786
9788
9789 if (HasVIndex && HasVOffset)
9790 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9791 { Op.getOperand(5), // VIndex
9792 VOffset }));
9793 else if (HasVIndex)
9794 Ops.push_back(Op.getOperand(5));
9795 else if (HasVOffset)
9796 Ops.push_back(VOffset);
9797
9798 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9799 Ops.push_back(Rsrc);
9800 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9801 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9802 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9803 Ops.push_back(
9804 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9806 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9807 Ops.push_back(M0Val.getValue(0)); // Chain
9808 Ops.push_back(M0Val.getValue(1)); // Glue
9809
9810 auto *M = cast<MemSDNode>(Op);
9811 MachineMemOperand *LoadMMO = M->getMemOperand();
9812 // Don't set the offset value here because the pointer points to the base of
9813 // the buffer.
9814 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9815
9816 MachinePointerInfo StorePtrI = LoadPtrI;
9817 LoadPtrI.V = PoisonValue::get(
9821
9822 auto F = LoadMMO->getFlags() &
9824 LoadMMO =
9826 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9827
9829 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9830 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9831
9832 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9833 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9834
9835 return SDValue(Load, 0);
9836 }
9837 case Intrinsic::amdgcn_global_load_lds: {
9838 unsigned Opc;
9839 unsigned Size = Op->getConstantOperandVal(4);
9840 switch (Size) {
9841 default:
9842 return SDValue();
9843 case 1:
9844 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9845 break;
9846 case 2:
9847 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9848 break;
9849 case 4:
9850 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9851 break;
9852 }
9853
9854 auto *M = cast<MemSDNode>(Op);
9855 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9856
9858
9859 SDValue Addr = Op.getOperand(2); // Global ptr
9860 SDValue VOffset;
9861 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9862 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9863 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9864 SDValue LHS = Addr.getOperand(0);
9865 SDValue RHS = Addr.getOperand(1);
9866
9867 if (LHS->isDivergent())
9868 std::swap(LHS, RHS);
9869
9870 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9871 RHS.getOperand(0).getValueType() == MVT::i32) {
9872 // add (i64 sgpr), (zero_extend (i32 vgpr))
9873 Addr = LHS;
9874 VOffset = RHS.getOperand(0);
9875 }
9876 }
9877
9878 Ops.push_back(Addr);
9879 if (!Addr->isDivergent()) {
9880 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9881 if (!VOffset)
9882 VOffset = SDValue(
9883 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9884 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9885 Ops.push_back(VOffset);
9886 }
9887
9888 Ops.push_back(Op.getOperand(5)); // Offset
9889 Ops.push_back(Op.getOperand(6)); // CPol
9890 Ops.push_back(M0Val.getValue(0)); // Chain
9891 Ops.push_back(M0Val.getValue(1)); // Glue
9892
9893 MachineMemOperand *LoadMMO = M->getMemOperand();
9894 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9895 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9896 MachinePointerInfo StorePtrI = LoadPtrI;
9897 LoadPtrI.V = PoisonValue::get(
9901 auto F = LoadMMO->getFlags() &
9903 LoadMMO =
9905 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9907 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9908 LoadMMO->getAAInfo());
9909
9910 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9911 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9912
9913 return SDValue(Load, 0);
9914 }
9915 case Intrinsic::amdgcn_end_cf:
9916 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9917 Op->getOperand(2), Chain), 0);
9918 case Intrinsic::amdgcn_s_barrier_init:
9919 case Intrinsic::amdgcn_s_barrier_join:
9920 case Intrinsic::amdgcn_s_wakeup_barrier: {
9921 SDValue Chain = Op->getOperand(0);
9923 SDValue BarOp = Op->getOperand(2);
9924 unsigned Opc;
9925 bool IsInlinableBarID = false;
9926 int64_t BarVal;
9927
9928 if (isa<ConstantSDNode>(BarOp)) {
9929 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9930 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9931 }
9932
9933 if (IsInlinableBarID) {
9934 switch (IntrinsicID) {
9935 default:
9936 return SDValue();
9937 case Intrinsic::amdgcn_s_barrier_init:
9938 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9939 break;
9940 case Intrinsic::amdgcn_s_barrier_join:
9941 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9942 break;
9943 case Intrinsic::amdgcn_s_wakeup_barrier:
9944 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9945 break;
9946 }
9947
9948 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9949 Ops.push_back(K);
9950 } else {
9951 switch (IntrinsicID) {
9952 default:
9953 return SDValue();
9954 case Intrinsic::amdgcn_s_barrier_init:
9955 Opc = AMDGPU::S_BARRIER_INIT_M0;
9956 break;
9957 case Intrinsic::amdgcn_s_barrier_join:
9958 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9959 break;
9960 case Intrinsic::amdgcn_s_wakeup_barrier:
9961 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9962 break;
9963 }
9964 }
9965
9966 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9967 SDValue M0Val;
9968 // Member count will be read from M0[16:22]
9969 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9970 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9971
9972 if (!IsInlinableBarID) {
9973 // If reference to barrier id is not an inline constant then it must be
9974 // referenced with M0[4:0]. Perform an OR with the member count to
9975 // include it in M0.
9976 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
9977 Op.getOperand(2), M0Val),
9978 0);
9979 }
9980 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
9981 } else if (IsInlinableBarID) {
9982 Ops.push_back(Chain);
9983 } else {
9984 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
9985 }
9986
9987 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9988 return SDValue(NewMI, 0);
9989 }
9990 case Intrinsic::amdgcn_s_prefetch_data: {
9991 // For non-global address space preserve the chain and remove the call.
9993 return Op.getOperand(0);
9994 return Op;
9995 }
9996 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
9997 SDValue Ops[] = {
9998 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
9999 Op.getOperand(3), // offset
10000 Op.getOperand(4), // length
10001 };
10002
10005 Op->getVTList(), Ops, M->getMemoryVT(),
10006 M->getMemOperand());
10007 }
10008 default: {
10009 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10011 return lowerImage(Op, ImageDimIntr, DAG, true);
10012
10013 return Op;
10014 }
10015 }
10016}
10017
10018// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10019// offset (the offset that is included in bounds checking and swizzling, to be
10020// split between the instruction's voffset and immoffset fields) and soffset
10021// (the offset that is excluded from bounds checking and swizzling, to go in
10022// the instruction's soffset field). This function takes the first kind of
10023// offset and figures out how to split it between voffset and immoffset.
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
    SDValue Offset, SelectionDAG &DAG) const {
  SDLoc DL(Offset);
  const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
  SDValue N0 = Offset;
  ConstantSDNode *C1 = nullptr;

  // Peel a constant off the offset: either the whole offset is a constant
  // (N0 becomes empty), or it is base + constant.
  if ((C1 = dyn_cast<ConstantSDNode>(N0)))
    N0 = SDValue();
  else if (DAG.isBaseWithConstantOffset(N0)) {
    C1 = cast<ConstantSDNode>(N0.getOperand(1));
    N0 = N0.getOperand(0);
  }

  if (C1) {
    unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, put only bits
    // that would normally fit in the immoffset field. The remaining value that
    // is copied/added for the voffset field is a large power of 2, and it
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not do that rounding down if that is a negative
    // number, as it appears to be illegal to have a negative offset in the
    // vgpr, even if adding the immediate offset makes it positive.
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      // Negative overflow: move the entire constant into the voffset part.
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
    if (Overflow) {
      // Fold the overflow back into the variable part (or use it alone when
      // the whole offset was constant).
      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
      if (!N0)
        N0 = OverflowVal;
      else {
        SDValue Ops[] = { N0, OverflowVal };
        N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
      }
    }
  }
  // Ensure both pieces exist: voffset defaults to 0, immoffset defaults to 0.
  if (!N0)
    N0 = DAG.getConstant(0, DL, MVT::i32);
  if (!C1)
    C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
  return {N0, SDValue(C1, 0)};
}
10071
10072// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
10073// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10074// pointed to by Offsets.
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        SelectionDAG &DAG, SDValue *Offsets,
                                        Align Alignment) const {
  // NOTE(review): one line was dropped from this extract here; `TII` is used
  // below with no visible declaration (presumably the SIInstrInfo pointer
  // from the subtarget) -- confirm upstream.
  SDLoc DL(CombinedOffset);
  // Case 1: the whole offset is a constant that splitMUBUFOffset can divide
  // between soffset and the instruction immediate; voffset stays 0.
  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }
  // Case 2: base + non-negative constant; the base goes in voffset and the
  // constant is split between soffset and the immediate.
  if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
    SDValue N0 = CombinedOffset.getOperand(0);
    SDValue N1 = CombinedOffset.getOperand(1);
    uint32_t SOffset, ImmOffset;
    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
    if (Offset >= 0 &&
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
      Offsets[0] = N0;
      Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
      Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
      return;
    }
  }

  // Fallback: put everything in voffset. Subtargets with a restricted
  // soffset operand want SGPR_NULL rather than a literal zero there.
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
                            ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
                            : DAG.getConstant(0, DL, MVT::i32);

  Offsets[0] = CombinedOffset;
  Offsets[1] = SOffsetZero;
  Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
10112
10113SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10114 SelectionDAG &DAG) const {
10115 if (!MaybePointer.getValueType().isScalarInteger())
10116 return MaybePointer;
10117
10118 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10119 return Rsrc;
10120}
10121
10122// Wrap a global or flat pointer into a buffer intrinsic using the flags
10123// specified in the intrinsic.
10124SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10125 SelectionDAG &DAG) const {
10126 SDLoc Loc(Op);
10127
10128 SDValue Pointer = Op->getOperand(1);
10129 SDValue Stride = Op->getOperand(2);
10130 SDValue NumRecords = Op->getOperand(3);
10131 SDValue Flags = Op->getOperand(4);
10132
10133 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10134 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10135 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10136 std::optional<uint32_t> ConstStride = std::nullopt;
10137 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10138 ConstStride = ConstNode->getZExtValue();
10139
10140 SDValue NewHighHalf = Masked;
10141 if (!ConstStride || *ConstStride != 0) {
10142 SDValue ShiftedStride;
10143 if (ConstStride) {
10144 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10145 } else {
10146 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10147 ShiftedStride =
10148 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10149 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10150 }
10151 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10152 }
10153
10154 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10155 NewHighHalf, NumRecords, Flags);
10156 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10157 return RsrcPtr;
10158}
10159
10160// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
                                                     EVT LoadVT, SDLoc DL,
    // NOTE(review): the parameter line declaring the operand list (an
    // ArrayRef<SDValue> `Ops` used below) was dropped from this extract.
                                                     MachineMemOperand *MMO,
                                                     bool IsTFE) const {
  EVT IntVT = LoadVT.changeTypeToInteger();

  if (IsTFE) {
    // TFE loads produce a second status dword; fetch both dwords together as
    // v2i32 and split them apart afterwards.
    unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
    // NOTE(review): the two arms of this conditional (the TFE byte/short
    // buffer-load opcodes) and the declaration of `MF` were dropped from
    // this extract -- confirm upstream.
    MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
    SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
    // NOTE(review): the first line of the `Status` extraction was dropped
    // from this extract; only its continuation survives below.
                              DAG.getConstant(1, DL, MVT::i32));
    SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
                               DAG.getConstant(0, DL, MVT::i32));
    // Narrow the loaded i32 to the requested width, then bitcast back to the
    // (possibly floating-point) result type.
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
    SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
    return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
  }

  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
  // NOTE(review): the continuation selecting between the byte and short
  // buffer-load opcodes was dropped from this extract.

  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  SDValue BufferLoad =
      DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
  // The hardware returns a full i32; truncate to the requested width and
  // bitcast back to the original result type.
  SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
  LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);

  return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
10196
10197// Handle 8 bit and 16 bit buffer stores
10198SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10199 EVT VDataType, SDLoc DL,
10200 SDValue Ops[],
10201 MemSDNode *M) const {
10202 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
10203 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10204
10205 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10206 Ops[1] = BufferStoreExt;
10207 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10208 AMDGPUISD::BUFFER_STORE_SHORT;
10209 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10210 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10211 M->getMemOperand());
10212}
10213
10215 ISD::LoadExtType ExtType, SDValue Op,
10216 const SDLoc &SL, EVT VT) {
10217 if (VT.bitsLT(Op.getValueType()))
10218 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10219
10220 switch (ExtType) {
10221 case ISD::SEXTLOAD:
10222 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10223 case ISD::ZEXTLOAD:
10224 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10225 case ISD::EXTLOAD:
10226 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10227 case ISD::NON_EXTLOAD:
10228 return Op;
10229 }
10230
10231 llvm_unreachable("invalid ext type");
10232}
10233
10234// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10235// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  // Only widen aligned, uniform (non-divergent) loads.
  if (Ld->getAlign() < Align(4) || Ld->isDivergent())
    return SDValue();

  // FIXME: Constant loads should all be marked invariant.
  unsigned AS = Ld->getAddressSpace();
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
  // NOTE(review): one line of this condition was dropped from this extract
  // (presumably a CONSTANT_ADDRESS_32BIT check) -- confirm upstream.
      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
    return SDValue();

  // Don't do this early, since it may interfere with adjacent load merging for
  // illegal types. We can avoid losing alignment information for exotic types
  // pre-legalize.
  EVT MemVT = Ld->getMemoryVT();
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
      MemVT.getSizeInBits() >= 32)
    return SDValue();

  SDLoc SL(Ld);

  assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
         "unexpected vector extload");

  // TODO: Drop only high part of range.
  SDValue Ptr = Ld->getBasePtr();
  // Emit the replacement 32-bit load; range metadata is dropped since it
  // described the narrower value.
  SDValue NewLoad = DAG.getLoad(
      ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
      Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
      Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
      nullptr); // Drop ranges

  EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
  if (MemVT.isFloatingPoint()) {
    // NOTE(review): the first line of an assert was dropped from this
    // extract; only its message continuation survives below.
           "unexpected fp extload");
    TruncVT = MemVT.changeTypeToInteger();
  }

  // Re-create the original load's extension semantics on the 32-bit value.
  SDValue Cvt = NewLoad;
  if (Ld->getExtensionType() == ISD::SEXTLOAD) {
    Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
                      DAG.getValueType(TruncVT));
  } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
  // NOTE(review): the continuation of this condition was dropped from this
  // extract -- confirm upstream.
    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
  } else {
  // NOTE(review): the body of this else (likely an assert of NON_EXTLOAD)
  // was dropped from this extract.
  }

  EVT VT = Ld->getValueType(0);
  EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());

  DCI.AddToWorklist(Cvt.getNode());

  // We may need to handle exotic cases, such as i16->i64 extloads, so insert
  // the appropriate extension from the 32-bit load.
  Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
  DCI.AddToWorklist(Cvt.getNode());

  // Handle conversion back to floating point if necessary.
  Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);

  return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
10302
// NOTE(review): the first line of this definition (its name and leading
// parameter) was lost in the extraction. Call sites below invoke it as
// addressMayBeAccessedAsPrivate(<MachineMemOperand*>, *MFI), so this is the
// trailing parameter and body of that predicate — confirm against upstream.
10304 const SIMachineFunctionInfo &Info) {
10305 // TODO: Should check if the address can definitely not access stack.
// Entry functions can only reach scratch through flat addressing when the
// flat-scratch-init user SGPR has been set up.
10306 if (Info.isEntryFunction())
10307 return Info.getUserSGPRInfo().hasFlatScratchInit();
// Conservatively assume callable (non-entry) functions may access the stack.
10308 return true;
10309}
10310
// Custom lowering for LOAD: widens sub-dword scalar loads to a 32-bit
// extending load, and splits / widens / scalarizes vector loads depending on
// the address space, alignment, element count and subtarget capabilities.
// Returning an empty SDValue() means the load is already legal as-is.
//
// NOTE(review): several individual lines of this function were dropped by the
// extraction (the cast of Op to LoadSDNode, the Elts vector declaration, the
// MFI lookup, and fragments of multi-line conditions). The surviving lines
// are kept byte-identical below; each gap is flagged inline. Confirm the
// elided text against the upstream source before relying on it.
10311SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10312 SDLoc DL(Op);
// [elided line 10313 — presumably: LoadSDNode *Load = cast<LoadSDNode>(Op);]
10314 ISD::LoadExtType ExtType = Load->getExtensionType();
10315 EVT MemVT = Load->getMemoryVT();
10316 MachineMemOperand *MMO = Load->getMemOperand();
10317
// Sub-32-bit non-extending loads: emit a 32-bit any-extending load and
// truncate the result back down.
10318 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10319 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10320 return SDValue();
10321
10322 // FIXME: Copied from PPC
10323 // First, load into 32 bits, then truncate to 1 bit.
10324
10325 SDValue Chain = Load->getChain();
10326 SDValue BasePtr = Load->getBasePtr();
10327
10328 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10329
10330 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10331 BasePtr, RealMemVT, MMO);
10332
10333 if (!MemVT.isVector()) {
10334 SDValue Ops[] = {
10335 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10336 NewLD.getValue(1)
10337 };
10338
10339 return DAG.getMergeValues(Ops, DL);
10340 }
10341
// Vector of i1: unpack each element with a shift + truncate.
// [elided line 10342 — presumably the declaration of the Elts SmallVector]
10343 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10344 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10345 DAG.getConstant(I, DL, MVT::i32));
10346
10347 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10348 }
10349
10350 SDValue Ops[] = {
10351 DAG.getBuildVector(MemVT, DL, Elts),
10352 NewLD.getValue(1)
10353 };
10354
10355 return DAG.getMergeValues(Ops, DL);
10356 }
10357
10358 if (!MemVT.isVector())
10359 return SDValue();
10360
10361 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10362 "Custom lowering for non-i32 vectors hasn't been implemented.");
10363
10364 Align Alignment = Load->getAlign();
10365 unsigned AS = Load->getAddressSpace();
// Hardware workaround: split misaligned multi-dword flat accesses.
10366 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10367 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10368 return SplitVectorLoad(Op, DAG);
10369 }
10370
// [elided lines 10371-10372 — presumably the MachineFunction / MFI lookup
//  used by the addressMayBeAccessedAsPrivate call below]
10373 // If there is a possibility that flat instruction access scratch memory
10374 // then we need to use the same legalization rules we use for private.
10375 if (AS == AMDGPUAS::FLAT_ADDRESS &&
// [elided line 10376 — the remaining operand of this && condition]
10377 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
// [elided lines 10378-10379 — the ? : arms selecting the effective AS]
10380 unsigned NumElements = MemVT.getVectorNumElements();
10381
// Uniform, sufficiently-aligned constant/global loads can use scalar loads.
10382 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
// [elided line 10383 — an additional address-space alternative]
10384 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
10385 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
// [elided line 10386 — the closing operand(s) of this condition]
10387 if ((!Op->isDivergent() || AMDGPUInstrInfo::isUniformMMO(MMO)) &&
10388 Alignment >= Align(4) && NumElements < 32) {
10389 if (MemVT.isPow2VectorType() ||
10390 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10391 return SDValue();
10392 return WidenOrSplitVectorLoad(Op, DAG);
10393 }
10394 // Non-uniform loads will be selected to MUBUF instructions, so they
10395 // have the same legalization requirements as global and private
10396 // loads.
10397 //
10398 }
10399 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
// [elided lines 10400-10401 — additional address-space alternatives]
10402 AS == AMDGPUAS::FLAT_ADDRESS) {
10403 if (NumElements > 4)
10404 return SplitVectorLoad(Op, DAG);
10405 // v3 loads not supported on SI.
10406 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10407 return WidenOrSplitVectorLoad(Op, DAG);
10408
10409 // v3 and v4 loads are supported for private and global memory.
10410 return SDValue();
10411 }
10412 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10413 // Depending on the setting of the private_element_size field in the
10414 // resource descriptor, we can only make private accesses up to a certain
10415 // size.
10416 switch (Subtarget->getMaxPrivateElementSize()) {
10417 case 4: {
10418 SDValue Ops[2];
10419 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10420 return DAG.getMergeValues(Ops, DL);
10421 }
10422 case 8:
10423 if (NumElements > 2)
10424 return SplitVectorLoad(Op, DAG);
10425 return SDValue();
10426 case 16:
10427 // Same as global/flat
10428 if (NumElements > 4)
10429 return SplitVectorLoad(Op, DAG);
10430 // v3 loads not supported on SI.
10431 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10432 return WidenOrSplitVectorLoad(Op, DAG);
10433
10434 return SDValue();
10435 default:
10436 llvm_unreachable("unsupported private_element_size");
10437 }
10438 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10439 unsigned Fast = 0;
10440 auto Flags = Load->getMemOperand()->getFlags();
// [elided line 10441 — presumably the allowsMisalignedMemoryAccessesImpl(...)
//  call whose trailing arguments appear on the next line]
10442 Load->getAlign(), Flags, &Fast) &&
10443 Fast > 1)
10444 return SDValue();
10445
10446 if (MemVT.isVector())
10447 return SplitVectorLoad(Op, DAG);
10448 }
10449
// [elided line 10450 — presumably the !allowsMemoryAccessForAlignment(...)
//  guard for the unaligned-load expansion below]
10451 MemVT, *Load->getMemOperand())) {
10452 SDValue Ops[2];
10453 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10454 return DAG.getMergeValues(Ops, DL);
10455 }
10456
10457 return SDValue();
10458}
10459
10460SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10461 EVT VT = Op.getValueType();
10462 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10463 VT.getSizeInBits() == 512)
10464 return splitTernaryVectorOp(Op, DAG);
10465
10466 assert(VT.getSizeInBits() == 64);
10467
10468 SDLoc DL(Op);
10469 SDValue Cond = Op.getOperand(0);
10470
10471 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10472 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10473
10474 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10475 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10476
10477 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10478 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10479
10480 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10481
10482 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10483 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10484
10485 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10486
10487 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10488 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10489}
10490
10491// Catch division cases where we can use shortcuts with rcp and rsq
10492// instructions.
10493SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10494 SelectionDAG &DAG) const {
10495 SDLoc SL(Op);
10496 SDValue LHS = Op.getOperand(0);
10497 SDValue RHS = Op.getOperand(1);
10498 EVT VT = Op.getValueType();
10499 const SDNodeFlags Flags = Op->getFlags();
10500
// NOTE(review): the second operand of this || was lost in the extraction
// (presumably the global unsafe-FP-math option) — confirm upstream. The code
// below depends only on the combined AllowInaccurateRcp flag.
10501 bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10503
// Constant numerators of +/-1.0 can be lowered directly to rcp.
10504 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10505 // Without !fpmath accuracy information, we can't do more because we don't
10506 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10507 // f16 is always accurate enough
10508 if (!AllowInaccurateRcp && VT != MVT::f16)
10509 return SDValue();
10510
10511 if (CLHS->isExactlyValue(1.0)) {
10512 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10513 // the CI documentation has a worst case error of 1 ulp.
10514 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10515 // use it as long as we aren't trying to use denormals.
10516 //
10517 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
10518
10519 // 1.0 / sqrt(x) -> rsq(x)
10520
10521 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10522 // error seems really high at 2^29 ULP.
10523 // 1.0 / x -> rcp(x)
10524 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10525 }
10526
10527 // Same as for 1.0, but expand the sign out of the constant.
10528 if (CLHS->isExactlyValue(-1.0)) {
10529 // -1.0 / x -> rcp (fneg x)
10530 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10531 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10532 }
10533 }
10534
10535 // For f16 require afn or arcp.
10536 // For f32 require afn.
10537 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10538 return SDValue();
10539
10540 // Turn into multiply by the reciprocal.
10541 // x / y -> x * (1.0 / y)
10542 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10543 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10545
// Fast f64 division: refine rcp(y) with two FMA-based correction steps, then
// multiply by x and apply one final residual correction. Only used when
// approximate math is permitted; otherwise returns an empty SDValue().
10546SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10547 SelectionDAG &DAG) const {
10548 SDLoc SL(Op);
10549 SDValue X = Op.getOperand(0);
10550 SDValue Y = Op.getOperand(1);
10551 EVT VT = Op.getValueType();
10552 const SDNodeFlags Flags = Op->getFlags();
10553
// NOTE(review): the second operand of this || was lost in the extraction
// (presumably the global unsafe-FP-math option) — confirm upstream.
10554 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10556 if (!AllowInaccurateDiv)
10557 return SDValue();
10558
10559 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10560 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10561
// R ~ 1/y; each Tmp = 1 - y*R measures the remaining error, and
// R = Tmp*R + R folds that error back into the estimate.
10562 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10563 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10564
10565 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10566 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10567 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
// Ret ~ x/y; Tmp2 = x - y*Ret is the residual, folded in by the final FMA.
10568 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10569 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10570 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10571}
10572
10573static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10574 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10575 SDNodeFlags Flags) {
10576 if (GlueChain->getNumValues() <= 1) {
10577 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10578 }
10579
10580 assert(GlueChain->getNumValues() == 3);
10581
10582 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10583 switch (Opcode) {
10584 default: llvm_unreachable("no chain equivalent for opcode");
10585 case ISD::FMUL:
10586 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10587 break;
10588 }
10589
10590 return DAG.getNode(Opcode, SL, VTList,
10591 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10592 Flags);
10593}
10594
10595static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10596 EVT VT, SDValue A, SDValue B, SDValue C,
10597 SDValue GlueChain, SDNodeFlags Flags) {
10598 if (GlueChain->getNumValues() <= 1) {
10599 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10600 }
10601
10602 assert(GlueChain->getNumValues() == 3);
10603
10604 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10605 switch (Opcode) {
10606 default: llvm_unreachable("no chain equivalent for opcode");
10607 case ISD::FMA:
10608 Opcode = AMDGPUISD::FMA_W_CHAIN;
10609 break;
10610 }
10611
10612 return DAG.getNode(Opcode, SL, VTList,
10613 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10614 Flags);
10615}
10616
10617SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10618 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10619 return FastLowered;
10620
10621 SDLoc SL(Op);
10622 SDValue Src0 = Op.getOperand(0);
10623 SDValue Src1 = Op.getOperand(1);
10624
10625 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10626 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10627
10628 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10629 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10630
10631 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10632 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10633
10634 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10635}
10636
10637// Faster 2.5 ULP division that does not support denormals.
10638SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10639 SDNodeFlags Flags = Op->getFlags();
10640 SDLoc SL(Op);
10641 SDValue LHS = Op.getOperand(1);
10642 SDValue RHS = Op.getOperand(2);
10643
10644 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10645
10646 const APFloat K0Val(0x1p+96f);
10647 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10648
10649 const APFloat K1Val(0x1p-32f);
10650 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10651
10652 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10653
10654 EVT SetCCVT =
10655 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10656
10657 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10658
10659 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10660
10661 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10662
10663 // rcp does not support denormals.
10664 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10665
10666 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10667
10668 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10669}
10670
10671// Returns immediate value for setting the F32 denorm mode when using the
10672// S_DENORM_MODE instruction.
// NOTE(review): the first line of this definition (its name, the SPDenormMode
// immediate parameter, and the SelectionDAG parameter) was lost in the
// extraction; call sites in LowerFDIV32 invoke it as
// getSPDenormModeValue(<mode>, DAG, Info, Subtarget). Confirm upstream.
10674 const SIMachineFunctionInfo *Info,
10675 const GCNSubtarget *ST) {
10676 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
// Preserve the function's double-precision denorm setting in bits [3:2];
// only the single-precision field (low 2 bits) is being overridden.
10677 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10678 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10679 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10680}
10681
// Full-precision f32 FDIV lowering: div_scale both operands, refine an rcp
// estimate with a glued FMA chain, then combine with div_fmas / div_fixup.
// If the function's FP32 denormal mode is not IEEE, denormal flushing is
// temporarily disabled around the FMA chain (via s_denorm_mode or s_setreg),
// saving and restoring a dynamic mode when necessary.
10682SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10683 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10684 return FastLowered;
10685
10686 // The selection matcher assumes anything with a chain selecting to a
10687 // mayRaiseFPException machine instruction. Since we're introducing a chain
10688 // here, we need to explicitly report nofpexcept for the regular fdiv
10689 // lowering.
10690 SDNodeFlags Flags = Op->getFlags();
10691 Flags.setNoFPExcept(true);
10692
10693 SDLoc SL(Op);
10694 SDValue LHS = Op.getOperand(0);
10695 SDValue RHS = Op.getOperand(1);
10696
10697 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10698
10699 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10700
10701 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10702 {RHS, RHS, LHS}, Flags);
10703 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10704 {LHS, RHS, LHS}, Flags);
10705
10706 // Denominator is scaled to not be denormal, so using rcp is ok.
10707 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10708 DenominatorScaled, Flags);
10709 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10710 DenominatorScaled, Flags);
10711
// Hardware-register bitfield addressing the FP32 denorm bits of MODE.
10712 using namespace AMDGPU::Hwreg;
10713 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10714 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10715
10716 const MachineFunction &MF = DAG.getMachineFunction();
// NOTE(review): a line was lost here in extraction — presumably the lookup of
// Info from MF (e.g. MF.getInfo<SIMachineFunctionInfo>()), given the uses of
// Info below. Confirm upstream.
10718 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10719
10720 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10721 const bool HasDynamicDenormals =
10722 (DenormMode.Input == DenormalMode::Dynamic) ||
10723 (DenormMode.Output == DenormalMode::Dynamic);
10724
10725 SDValue SavedDenormMode;
10726
10727 if (!PreservesDenormals) {
10728 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10729 // lowering. The chain dependence is insufficient, and we need glue. We do
10730 // not need the glue variants in a strictfp function.
10731
10732 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10733
10734 SDValue Glue = DAG.getEntryNode();
// A dynamic denorm mode must be read (s_getreg) so it can be restored later.
10735 if (HasDynamicDenormals) {
10736 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10737 DAG.getVTList(MVT::i32, MVT::Glue),
10738 {BitField, Glue});
10739 SavedDenormMode = SDValue(GetReg, 0);
10740
10741 Glue = DAG.getMergeValues(
10742 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10743 }
10744
10745 SDNode *EnableDenorm;
10746 if (Subtarget->hasDenormModeInst()) {
10747 const SDValue EnableDenormValue =
10748 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10749
10750 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10751 EnableDenormValue)
10752 .getNode();
10753 } else {
10754 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10755 SL, MVT::i32);
10756 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10757 {EnableDenormValue, BitField, Glue});
10758 }
10759
// Glue the first FMA input to the mode switch so it cannot be reordered
// before denormal support is enabled.
10760 SDValue Ops[3] = {
10761 NegDivScale0,
10762 SDValue(EnableDenorm, 0),
10763 SDValue(EnableDenorm, 1)
10764 };
10765
10766 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10767 }
10768
10769 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10770 ApproxRcp, One, NegDivScale0, Flags);
10771
10772 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10773 ApproxRcp, Fma0, Flags);
10774
10775 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10776 Fma1, Fma1, Flags);
10777
10778 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10779 NumeratorScaled, Mul, Flags);
10780
10781 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10782 Fma2, Fma1, Mul, Fma2, Flags);
10783
10784 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10785 NumeratorScaled, Fma3, Flags);
10786
// Restore the previous denormal mode after the FMA chain.
10787 if (!PreservesDenormals) {
10788 SDNode *DisableDenorm;
10789 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10790 const SDValue DisableDenormValue = getSPDenormModeValue(
10791 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10792
10793 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10794 Fma4.getValue(1), DisableDenormValue,
10795 Fma4.getValue(2)).getNode();
10796 } else {
10797 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10798 const SDValue DisableDenormValue =
10799 HasDynamicDenormals
10800 ? SavedDenormMode
10801 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10802
10803 DisableDenorm = DAG.getMachineNode(
10804 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10805 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10806 }
10807
10808 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10809 SDValue(DisableDenorm, 0), DAG.getRoot());
10810 DAG.setRoot(OutputChain);
10811 }
10812
10813 SDValue Scale = NumeratorScaled.getValue(1);
10814 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10815 {Fma4, Fma1, Fma3, Scale}, Flags);
10816
10817 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10818}
10819
10820SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10821 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10822 return FastLowered;
10823
10824 SDLoc SL(Op);
10825 SDValue X = Op.getOperand(0);
10826 SDValue Y = Op.getOperand(1);
10827
10828 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10829
10830 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10831
10832 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10833
10834 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10835
10836 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10837
10838 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10839
10840 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10841
10842 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10843
10844 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10845
10846 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10847 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10848
10849 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10850 NegDivScale0, Mul, DivScale1);
10851
10852 SDValue Scale;
10853
10854 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10855 // Workaround a hardware bug on SI where the condition output from div_scale
10856 // is not usable.
10857
10858 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10859
10860 // Figure out if the scale to use for div_fmas.
10861 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10862 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10863 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10864 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10865
10866 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10867 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10868
10869 SDValue Scale0Hi
10870 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10871 SDValue Scale1Hi
10872 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10873
10874 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10875 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10876 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10877 } else {
10878 Scale = DivScale1.getValue(1);
10879 }
10880
10881 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10882 Fma4, Fma3, Mul, Scale);
10883
10884 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10885}
10886
10887SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10888 EVT VT = Op.getValueType();
10889
10890 if (VT == MVT::f32)
10891 return LowerFDIV32(Op, DAG);
10892
10893 if (VT == MVT::f64)
10894 return LowerFDIV64(Op, DAG);
10895
10896 if (VT == MVT::f16)
10897 return LowerFDIV16(Op, DAG);
10898
10899 llvm_unreachable("Unexpected type for fdiv");
10900}
10901
// Lower FFREXP using the amdgcn frexp_mant / frexp_exp intrinsics, returning
// the mantissa and (sign-extended or truncated) exponent as merged values.
// On subtargets with the fract bug, non-finite inputs are patched up with
// selects so the original value / a zero exponent are returned instead.
10902SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10903 SDLoc dl(Op);
10904 SDValue Val = Op.getOperand(0);
10905 EVT VT = Val.getValueType();
10906 EVT ResultExpVT = Op->getValueType(1);
// The instruction produces a 16-bit exponent for f16, 32-bit otherwise.
10907 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10908
10909 SDValue Mant = DAG.getNode(
// NOTE(review): a line was lost here in extraction — presumably the
// 'ISD::INTRINSIC_WO_CHAIN, dl, VT,' arguments, matching the Exp node below.
10911 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10912
10913 SDValue Exp = DAG.getNode(
10914 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10915 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10916
10917 if (Subtarget->hasFractBug()) {
10918 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10919 SDValue Inf =
// NOTE(review): a line was lost here in extraction — presumably the
// construction of the +inf constant of type VT compared against below.
10921
// |x| < inf <=> x is finite (an unordered compare also rejects NaN).
10922 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10923 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10924 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10925 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10926 }
10927
10928 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10929 return DAG.getMergeValues({Mant, CastExp}, dl);
10930}
10931
// Custom lowering for STORE: i1 stores become i32 truncating stores, and
// vector stores are split / scalarized / expanded according to the address
// space, alignment and subtarget limits. An empty SDValue() means the store
// is already legal as-is.
//
// NOTE(review): several individual lines of this function were dropped by the
// extraction (the cast of Op to StoreSDNode, the MFI lookup, and fragments of
// multi-line conditions). The surviving lines are kept byte-identical below;
// gaps are flagged inline. Confirm the elided text against upstream.
10932SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10933 SDLoc DL(Op);
// [elided line 10934 — presumably: StoreSDNode *Store = cast<StoreSDNode>(Op);]
10935 EVT VT = Store->getMemoryVT();
10936
10937 if (VT == MVT::i1) {
10938 return DAG.getTruncStore(Store->getChain(), DL,
10939 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10940 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10941 }
10942
10943 assert(VT.isVector() &&
10944 Store->getValue().getValueType().getScalarType() == MVT::i32);
10945
10946 unsigned AS = Store->getAddressSpace();
// Hardware workaround: split misaligned multi-dword flat accesses.
10947 if (Subtarget->hasLDSMisalignedBug() &&
10948 AS == AMDGPUAS::FLAT_ADDRESS &&
10949 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10950 return SplitVectorStore(Op, DAG);
10951 }
10952
// [elided lines 10953-10954 — presumably the MachineFunction / MFI lookup
//  used by the addressMayBeAccessedAsPrivate call below]
10955 // If there is a possibility that flat instruction access scratch memory
10956 // then we need to use the same legalization rules we use for private.
10957 if (AS == AMDGPUAS::FLAT_ADDRESS &&
// [elided line 10958 — the remaining operand of this && condition]
10959 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
// [elided lines 10960-10961 — the ? : arms selecting the effective AS]
10962 unsigned NumElements = VT.getVectorNumElements();
10963 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10964 AS == AMDGPUAS::FLAT_ADDRESS) {
10965 if (NumElements > 4)
10966 return SplitVectorStore(Op, DAG);
10967 // v3 stores not supported on SI.
10968 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10969 return SplitVectorStore(Op, DAG);
10970
// [elided line 10971 — presumably the !allowsMemoryAccessForAlignment(...)
//  guard for the unaligned-store expansion below]
10972 VT, *Store->getMemOperand()))
10973 return expandUnalignedStore(Store, DAG);
10974
10975 return SDValue();
10976 }
10977 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
// Private element size limits mirror the LowerLOAD handling above.
10978 switch (Subtarget->getMaxPrivateElementSize()) {
10979 case 4:
10980 return scalarizeVectorStore(Store, DAG);
10981 case 8:
10982 if (NumElements > 2)
10983 return SplitVectorStore(Op, DAG);
10984 return SDValue();
10985 case 16:
10986 if (NumElements > 4 ||
10987 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10988 return SplitVectorStore(Op, DAG);
10989 return SDValue();
10990 default:
10991 llvm_unreachable("unsupported private_element_size");
10992 }
10993 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10994 unsigned Fast = 0;
10995 auto Flags = Store->getMemOperand()->getFlags();
// [elided line 10996 — presumably the allowsMisalignedMemoryAccessesImpl(...)
//  call whose trailing arguments appear on the next line]
10997 Store->getAlign(), Flags, &Fast) &&
10998 Fast > 1)
10999 return SDValue();
11000
11001 if (VT.isVector())
11002 return SplitVectorStore(Op, DAG);
11003
11004 return expandUnalignedStore(Store, DAG);
11005 }
11006
11007 // Probably an invalid store. If so we'll end up emitting a selection error.
11008 return SDValue();
11009}
11010
11011// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11012SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11013 SDLoc SL(Op);
11014 assert(!Subtarget->has16BitInsts());
11015 SDNodeFlags Flags = Op->getFlags();
11016 SDValue Ext =
11017 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11018
11019 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11020 SDValue Sqrt =
11021 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11022
11023 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11024 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11025}
11026
// Full-precision f32 sqrt lowering. Small inputs are pre-scaled by 2^32 and
// the result re-scaled by 2^-16. When denormals must be handled the hardware
// sqrt estimate is corrected by probing the two adjacent representable values
// via FMA residuals; otherwise an rsq-based refinement is used. Zero and
// +inf inputs are passed through unchanged at the end.
11027SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11028 SDLoc DL(Op);
11029 SDNodeFlags Flags = Op->getFlags();
11030 MVT VT = Op.getValueType().getSimpleVT();
11031 const SDValue X = Op.getOperand(0);
11032
11033 if (allowApproxFunc(DAG, Flags)) {
11034 // Instruction is 1ulp but ignores denormals.
11035 return DAG.getNode(
// NOTE(review): a line was lost here in extraction — presumably the
// 'ISD::INTRINSIC_WO_CHAIN, DL, VT,' arguments of this getNode call.
11037 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11038 }
11039
11040 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11041 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11042
11043 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11044
11045 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11046
11047 SDValue SqrtX =
11048 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11049
11050 SDValue SqrtS;
11051 if (needsDenormHandlingF32(DAG, X, Flags)) {
11052 SDValue SqrtID =
11053 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11054 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11055
// Step the estimate one ulp down/up by adjusting its bit pattern.
11056 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11057 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11058 DAG.getConstant(-1, DL, MVT::i32));
11059 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11060
11061 SDValue NegSqrtSNextDown =
11062 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11063
// Residual x - s_down*s; if it is <= 0 the lower neighbor is the better root.
11064 SDValue SqrtVP =
11065 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11066
11067 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11068 DAG.getConstant(1, DL, MVT::i32));
11069 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11070
11071 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
// Residual x - s_up*s; if it is > 0 the upper neighbor is the better root.
11072 SDValue SqrtVS =
11073 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11074
11075 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11076 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11077
11078 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11079 Flags);
11080
11081 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11082 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11083 Flags);
11084 } else {
// Refine an rsq-based estimate with FMA correction steps.
11085 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11086
11087 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11088
11089 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11090 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11091 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11092
11093 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11094 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11095 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11096
11097 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11098 SDValue SqrtD =
11099 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11100 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11101 }
11102
11103 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11104
11105 SDValue ScaledDown =
11106 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11107
11108 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11109 SDValue IsZeroOrInf =
11110 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11111 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11112
11113 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11114}
11115
11116SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11117 // For double type, the SQRT and RSQ instructions don't have required
11118 // precision, we apply Goldschmidt's algorithm to improve the result:
11119 //
11120 // y0 = rsq(x)
11121 // g0 = x * y0
11122 // h0 = 0.5 * y0
11123 //
11124 // r0 = 0.5 - h0 * g0
11125 // g1 = g0 * r0 + g0
11126 // h1 = h0 * r0 + h0
11127 //
11128 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11129 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11130 // h2 = h1 * r1 + h1
11131 //
11132 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11133 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11134 //
11135 // sqrt(x) = g3
11136
11137 SDNodeFlags Flags = Op->getFlags();
11138
11139 SDLoc DL(Op);
11140
11141 SDValue X = Op.getOperand(0);
11142 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11143
11144 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11145
11146 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11147
11148 // Scale up input if it is too small.
11149 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11150 SDValue ScaleUp =
11151 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11152 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11153
11154 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11155
11156 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11157
11158 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11159 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11160
11161 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11162 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11163
11164 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11165
11166 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11167
11168 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11169 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11170
11171 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11172
11173 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11174 SDValue SqrtD1 =
11175 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11176
11177 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11178
11179 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11180 SDValue ScaleDown =
11181 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11182 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11183
11184 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11185 // with finite only or nsz because rsq(+/-0) = +/-inf
11186
11187 // TODO: Check for DAZ and expand to subnormals
11188 SDValue IsZeroOrInf =
11189 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11190 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11191
11192 // If x is +INF, +0, or -0, use its original value
11193 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11194 Flags);
11195}
11196
11197SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11198 SDLoc DL(Op);
11199 EVT VT = Op.getValueType();
11200 SDValue Arg = Op.getOperand(0);
11201 SDValue TrigVal;
11202
11203 // Propagate fast-math flags so that the multiply we introduce can be folded
11204 // if Arg is already the result of a multiply by constant.
11205 auto Flags = Op->getFlags();
11206
11207 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11208
11209 if (Subtarget->hasTrigReducedRange()) {
11210 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11211 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11212 } else {
11213 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11214 }
11215
11216 switch (Op.getOpcode()) {
11217 case ISD::FCOS:
11218 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11219 case ISD::FSIN:
11220 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11221 default:
11222 llvm_unreachable("Wrong trig opcode");
11223 }
11224}
11225
11226SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11227 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11228 assert(AtomicNode->isCompareAndSwap());
11229 unsigned AS = AtomicNode->getAddressSpace();
11230
11231 // No custom lowering required for local address space
11233 return Op;
11234
11235 // Non-local address space requires custom lowering for atomic compare
11236 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11237 SDLoc DL(Op);
11238 SDValue ChainIn = Op.getOperand(0);
11239 SDValue Addr = Op.getOperand(1);
11240 SDValue Old = Op.getOperand(2);
11241 SDValue New = Op.getOperand(3);
11242 EVT VT = Op.getValueType();
11243 MVT SimpleVT = VT.getSimpleVT();
11244 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11245
11246 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11247 SDValue Ops[] = { ChainIn, Addr, NewOld };
11248
11249 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11250 Ops, VT, AtomicNode->getMemOperand());
11251}
11252
11253//===----------------------------------------------------------------------===//
11254// Custom DAG optimizations
11255//===----------------------------------------------------------------------===//
11256
11257SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11258 DAGCombinerInfo &DCI) const {
11259 EVT VT = N->getValueType(0);
11260 EVT ScalarVT = VT.getScalarType();
11261 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11262 return SDValue();
11263
11264 SelectionDAG &DAG = DCI.DAG;
11265 SDLoc DL(N);
11266
11267 SDValue Src = N->getOperand(0);
11268 EVT SrcVT = Src.getValueType();
11269
11270 // TODO: We could try to match extracting the higher bytes, which would be
11271 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11272 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11273 // about in practice.
11274 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11275 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11276 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11277 DCI.AddToWorklist(Cvt.getNode());
11278
11279 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11280 if (ScalarVT != MVT::f32) {
11281 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11282 DAG.getTargetConstant(0, DL, MVT::i32));
11283 }
11284 return Cvt;
11285 }
11286 }
11287
11288 return SDValue();
11289}
11290
11291SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11292 DAGCombinerInfo &DCI) const {
11293 SDValue MagnitudeOp = N->getOperand(0);
11294 SDValue SignOp = N->getOperand(1);
11295 SelectionDAG &DAG = DCI.DAG;
11296 SDLoc DL(N);
11297
11298 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11299 // lower half with a copy.
11300 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11301 if (MagnitudeOp.getValueType() == MVT::f64) {
11302 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11303 SDValue MagLo =
11304 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11305 DAG.getConstant(0, DL, MVT::i32));
11306 SDValue MagHi =
11307 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11308 DAG.getConstant(1, DL, MVT::i32));
11309
11310 SDValue HiOp =
11311 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11312
11313 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11314
11315 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11316 }
11317
11318 if (SignOp.getValueType() != MVT::f64)
11319 return SDValue();
11320
11321 // Reduce width of sign operand, we only need the highest bit.
11322 //
11323 // fcopysign f64:x, f64:y ->
11324 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11325 // TODO: In some cases it might make sense to go all the way to f16.
11326 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11327 SDValue SignAsF32 =
11328 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11329 DAG.getConstant(1, DL, MVT::i32));
11330
11331 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11332 SignAsF32);
11333}
11334
11335// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11336// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11337// bits
11338
11339// This is a variant of
11340// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11341//
11342// The normal DAG combiner will do this, but only if the add has one use since
11343// that would increase the number of instructions.
11344//
11345// This prevents us from seeing a constant offset that can be folded into a
11346// memory instruction's addressing mode. If we know the resulting add offset of
11347// a pointer can be folded into an addressing offset, we can replace the pointer
11348// operand with the add of new constant offset. This eliminates one of the uses,
11349// and may allow the remaining use to also be simplified.
11350//
11351SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11352 unsigned AddrSpace,
11353 EVT MemVT,
11354 DAGCombinerInfo &DCI) const {
11355 SDValue N0 = N->getOperand(0);
11356 SDValue N1 = N->getOperand(1);
11357
11358 // We only do this to handle cases where it's profitable when there are
11359 // multiple uses of the add, so defer to the standard combine.
11360 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11361 N0->hasOneUse())
11362 return SDValue();
11363
11365 if (!CN1)
11366 return SDValue();
11367
11369 if (!CAdd)
11370 return SDValue();
11371
11372 SelectionDAG &DAG = DCI.DAG;
11373
11374 if (N0->getOpcode() == ISD::OR &&
11375 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11376 return SDValue();
11377
11378 // If the resulting offset is too large, we can't fold it into the
11379 // addressing mode offset.
11380 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11381 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11382
11383 AddrMode AM;
11384 AM.HasBaseReg = true;
11385 AM.BaseOffs = Offset.getSExtValue();
11386 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11387 return SDValue();
11388
11389 SDLoc SL(N);
11390 EVT VT = N->getValueType(0);
11391
11392 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11393 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11394
11395 SDNodeFlags Flags;
11396 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11397 (N0.getOpcode() == ISD::OR ||
11398 N0->getFlags().hasNoUnsignedWrap()));
11399
11400 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11401}
11402
11403/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
11404/// by the chain and intrinsic ID. Theoretically we would also need to check the
11405/// specific intrinsic, but they all place the pointer operand first.
11406static unsigned getBasePtrIndex(const MemSDNode *N) {
11407 switch (N->getOpcode()) {
11408 case ISD::STORE:
11411 return 2;
11412 default:
11413 return 1;
11414 }
11415}
11416
11417SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11418 DAGCombinerInfo &DCI) const {
11419 SelectionDAG &DAG = DCI.DAG;
11420 SDLoc SL(N);
11421
11422 unsigned PtrIdx = getBasePtrIndex(N);
11423 SDValue Ptr = N->getOperand(PtrIdx);
11424
11425 // TODO: We could also do this for multiplies.
11426 if (Ptr.getOpcode() == ISD::SHL) {
11427 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11428 N->getMemoryVT(), DCI);
11429 if (NewPtr) {
11430 SmallVector<SDValue, 8> NewOps(N->ops());
11431
11432 NewOps[PtrIdx] = NewPtr;
11433 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11434 }
11435 }
11436
11437 return SDValue();
11438}
11439
11440static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11441 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11442 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11443 (Opc == ISD::XOR && Val == 0);
11444}
11445
11446// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11447// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11448// integer combine opportunities since most 64-bit operations are decomposed
11449// this way. TODO: We won't want this for SALU especially if it is an inline
11450// immediate.
11451SDValue SITargetLowering::splitBinaryBitConstantOp(
11452 DAGCombinerInfo &DCI,
11453 const SDLoc &SL,
11454 unsigned Opc, SDValue LHS,
11455 const ConstantSDNode *CRHS) const {
11456 uint64_t Val = CRHS->getZExtValue();
11457 uint32_t ValLo = Lo_32(Val);
11458 uint32_t ValHi = Hi_32(Val);
11460
11461 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11462 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11463 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11464 // If we need to materialize a 64-bit immediate, it will be split up later
11465 // anyway. Avoid creating the harder to understand 64-bit immediate
11466 // materialization.
11467 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11468 }
11469
11470 return SDValue();
11471}
11472
11474 if (V.getValueType() != MVT::i1)
11475 return false;
11476 switch (V.getOpcode()) {
11477 default:
11478 break;
11479 case ISD::SETCC:
11481 return true;
11482 case ISD::AND:
11483 case ISD::OR:
11484 case ISD::XOR:
11485 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11486 }
11487 return false;
11488}
11489
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
// NOTE(review): the function signature line was missing; restored.
static uint32_t getConstantPermuteMask(uint32_t C) {
  // 0xff for any zero byte in the mask
  uint32_t ZeroByteMask = 0;
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
  if ((NonZeroByteMask & C) != NonZeroByteMask)
    return 0; // Partial bytes selected.
  return C;
}
11504
11505// Check if a node selects whole bytes from its operand 0 starting at a byte
11506// boundary while masking the rest. Returns select mask as in the v_perm_b32
11507// or -1 if not succeeded.
11508// Note byte select encoding:
11509// value 0-3 selects corresponding source byte;
11510// value 0xc selects zero;
11511// value 0xff selects 0xff.
11513 assert(V.getValueSizeInBits() == 32);
11514
11515 if (V.getNumOperands() != 2)
11516 return ~0;
11517
11518 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11519 if (!N1)
11520 return ~0;
11521
11522 uint32_t C = N1->getZExtValue();
11523
11524 switch (V.getOpcode()) {
11525 default:
11526 break;
11527 case ISD::AND:
11528 if (uint32_t ConstMask = getConstantPermuteMask(C))
11529 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11530 break;
11531
11532 case ISD::OR:
11533 if (uint32_t ConstMask = getConstantPermuteMask(C))
11534 return (0x03020100 & ~ConstMask) | ConstMask;
11535 break;
11536
11537 case ISD::SHL:
11538 if (C % 8)
11539 return ~0;
11540
11541 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11542
11543 case ISD::SRL:
11544 if (C % 8)
11545 return ~0;
11546
11547 return uint32_t(0x0c0c0c0c03020100ull >> C);
11548 }
11549
11550 return ~0;
11551}
11552
11553SDValue SITargetLowering::performAndCombine(SDNode *N,
11554 DAGCombinerInfo &DCI) const {
11555 if (DCI.isBeforeLegalize())
11556 return SDValue();
11557
11558 SelectionDAG &DAG = DCI.DAG;
11559 EVT VT = N->getValueType(0);
11560 SDValue LHS = N->getOperand(0);
11561 SDValue RHS = N->getOperand(1);
11562
11563
11564 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11565 if (VT == MVT::i64 && CRHS) {
11566 if (SDValue Split
11567 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11568 return Split;
11569 }
11570
11571 if (CRHS && VT == MVT::i32) {
11572 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11573 // nb = number of trailing zeroes in mask
11574 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11575 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11576 uint64_t Mask = CRHS->getZExtValue();
11577 unsigned Bits = llvm::popcount(Mask);
11578 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11579 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11580 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11581 unsigned Shift = CShift->getZExtValue();
11582 unsigned NB = CRHS->getAPIntValue().countr_zero();
11583 unsigned Offset = NB + Shift;
11584 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11585 SDLoc SL(N);
11586 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11587 LHS->getOperand(0),
11588 DAG.getConstant(Offset, SL, MVT::i32),
11589 DAG.getConstant(Bits, SL, MVT::i32));
11590 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11591 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11592 DAG.getValueType(NarrowVT));
11593 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11594 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11595 return Shl;
11596 }
11597 }
11598 }
11599
11600 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11601 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11602 isa<ConstantSDNode>(LHS.getOperand(2))) {
11603 uint32_t Sel = getConstantPermuteMask(Mask);
11604 if (!Sel)
11605 return SDValue();
11606
11607 // Select 0xc for all zero bytes
11608 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11609 SDLoc DL(N);
11610 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11611 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11612 }
11613 }
11614
11615 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11616 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11617 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11618 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11619 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11620
11621 SDValue X = LHS.getOperand(0);
11622 SDValue Y = RHS.getOperand(0);
11623 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11624 !isTypeLegal(X.getValueType()))
11625 return SDValue();
11626
11627 if (LCC == ISD::SETO) {
11628 if (X != LHS.getOperand(1))
11629 return SDValue();
11630
11631 if (RCC == ISD::SETUNE) {
11632 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11633 if (!C1 || !C1->isInfinity() || C1->isNegative())
11634 return SDValue();
11635
11642
11643 static_assert(((~(SIInstrFlags::S_NAN |
11646 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11647 "mask not equal");
11648
11649 SDLoc DL(N);
11650 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11651 X, DAG.getConstant(Mask, DL, MVT::i32));
11652 }
11653 }
11654 }
11655
11656 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11657 std::swap(LHS, RHS);
11658
11659 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11660 RHS.hasOneUse()) {
11661 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11662 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11663 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11664 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11665 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11666 (RHS.getOperand(0) == LHS.getOperand(0) &&
11667 LHS.getOperand(0) == LHS.getOperand(1))) {
11668 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11669 unsigned NewMask = LCC == ISD::SETO ?
11670 Mask->getZExtValue() & ~OrdMask :
11671 Mask->getZExtValue() & OrdMask;
11672
11673 SDLoc DL(N);
11674 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11675 DAG.getConstant(NewMask, DL, MVT::i32));
11676 }
11677 }
11678
11679 if (VT == MVT::i32 &&
11680 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11681 // and x, (sext cc from i1) => select cc, x, 0
11682 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11683 std::swap(LHS, RHS);
11684 if (isBoolSGPR(RHS.getOperand(0)))
11685 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11686 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11687 }
11688
11689 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11691 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11692 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11693 uint32_t LHSMask = getPermuteMask(LHS);
11694 uint32_t RHSMask = getPermuteMask(RHS);
11695 if (LHSMask != ~0u && RHSMask != ~0u) {
11696 // Canonicalize the expression in an attempt to have fewer unique masks
11697 // and therefore fewer registers used to hold the masks.
11698 if (LHSMask > RHSMask) {
11699 std::swap(LHSMask, RHSMask);
11700 std::swap(LHS, RHS);
11701 }
11702
11703 // Select 0xc for each lane used from source operand. Zero has 0xc mask
11704 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
11705 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11706 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11707
11708 // Check of we need to combine values from two sources within a byte.
11709 if (!(LHSUsedLanes & RHSUsedLanes) &&
11710 // If we select high and lower word keep it for SDWA.
11711 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11712 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11713 // Each byte in each mask is either selector mask 0-3, or has higher
11714 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
11715 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
11716 // mask which is not 0xff wins. By anding both masks we have a correct
11717 // result except that 0x0c shall be corrected to give 0x0c only.
11718 uint32_t Mask = LHSMask & RHSMask;
11719 for (unsigned I = 0; I < 32; I += 8) {
11720 uint32_t ByteSel = 0xff << I;
11721 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11722 Mask &= (0x0c << I) & 0xffffffff;
11723 }
11724
11725 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11726 // or 0x0c.
11727 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11728 SDLoc DL(N);
11729
11730 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11731 LHS.getOperand(0), RHS.getOperand(0),
11732 DAG.getConstant(Sel, DL, MVT::i32));
11733 }
11734 }
11735 }
11736
11737 return SDValue();
11738}
11739
11740// A key component of v_perm is a mapping between byte position of the src
11741// operands, and the byte position of the dest. To provide such, we need: 1. the
11742// node that provides x byte of the dest of the OR, and 2. the byte of the node
11743// used to provide that x byte. calculateByteProvider finds which node provides
11744// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
11745// and finds an ultimate src and byte position For example: The supported
11746// LoadCombine pattern for vector loads is as follows
11747// t1
11748// or
11749// / \
11750// t2 t3
11751// zext shl
11752// | | \
11753// t4 t5 16
11754// or anyext
11755// / \ |
11756// t6 t7 t8
11757// srl shl or
11758// / | / \ / \
11759// t9 t10 t11 t12 t13 t14
11760// trunc* 8 trunc* 8 and and
11761// | | / | | \
11762// t15 t16 t17 t18 t19 t20
11763// trunc* 255 srl -256
11764// | / \
11765// t15 t15 16
11766//
11767// *In this example, the truncs are from i32->i16
11768//
11769// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11770// respectively. calculateSrcByte would find (given node) -> ultimate src &
11771// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11772// After finding the mapping, we can combine the tree into vperm t15, t16,
11773// 0x05000407
11774
11775// Find the source and byte position from a node.
11776// \p DestByte is the byte position of the dest of the or that the src
11777// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11778// dest of the or byte. \p Depth tracks how many recursive iterations we have
11779// performed.
11780static const std::optional<ByteProvider<SDValue>>
11781calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11782 unsigned Depth = 0) {
11783 // We may need to recursively traverse a series of SRLs
11784 if (Depth >= 6)
11785 return std::nullopt;
11786
11787 if (Op.getValueSizeInBits() < 8)
11788 return std::nullopt;
11789
11790 if (Op.getValueType().isVector())
11791 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11792
11793 switch (Op->getOpcode()) {
11794 case ISD::TRUNCATE: {
11795 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11796 }
11797
11798 case ISD::SIGN_EXTEND:
11799 case ISD::ZERO_EXTEND:
11801 SDValue NarrowOp = Op->getOperand(0);
11802 auto NarrowVT = NarrowOp.getValueType();
11803 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11804 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11805 NarrowVT = VTSign->getVT();
11806 }
11807 if (!NarrowVT.isByteSized())
11808 return std::nullopt;
11809 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11810
11811 if (SrcIndex >= NarrowByteWidth)
11812 return std::nullopt;
11813 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11814 }
11815
11816 case ISD::SRA:
11817 case ISD::SRL: {
11818 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11819 if (!ShiftOp)
11820 return std::nullopt;
11821
11822 uint64_t BitShift = ShiftOp->getZExtValue();
11823
11824 if (BitShift % 8 != 0)
11825 return std::nullopt;
11826
11827 SrcIndex += BitShift / 8;
11828
11829 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11830 }
11831
11832 default: {
11833 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11834 }
11835 }
11836 llvm_unreachable("fully handled switch");
11837}
11838
11839// For a byte position in the result of an Or, traverse the tree and find the
11840// node (and the byte of the node) which ultimately provides this {Or,
11841// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11842// the byte position of the Op that corresponds with the originally requested
11843// byte of the Or \p Depth tracks how many recursive iterations we have
11844// performed. \p StartingIndex is the originally requested byte of the Or
11845static const std::optional<ByteProvider<SDValue>>
11846calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11847 unsigned StartingIndex = 0) {
11848 // Finding Src tree of RHS of or typically requires at least 1 additional
11849 // depth
11850 if (Depth > 6)
11851 return std::nullopt;
11852
11853 unsigned BitWidth = Op.getScalarValueSizeInBits();
11854 if (BitWidth % 8 != 0)
11855 return std::nullopt;
11856 if (Index > BitWidth / 8 - 1)
11857 return std::nullopt;
11858
11859 bool IsVec = Op.getValueType().isVector();
11860 switch (Op.getOpcode()) {
11861 case ISD::OR: {
11862 if (IsVec)
11863 return std::nullopt;
11864
11865 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11866 StartingIndex);
11867 if (!RHS)
11868 return std::nullopt;
11869 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11870 StartingIndex);
11871 if (!LHS)
11872 return std::nullopt;
11873 // A well formed Or will have two ByteProviders for each byte, one of which
11874 // is constant zero
11875 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11876 return std::nullopt;
11877 if (!LHS || LHS->isConstantZero())
11878 return RHS;
11879 if (!RHS || RHS->isConstantZero())
11880 return LHS;
11881 return std::nullopt;
11882 }
11883
11884 case ISD::AND: {
11885 if (IsVec)
11886 return std::nullopt;
11887
11888 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11889 if (!BitMaskOp)
11890 return std::nullopt;
11891
11892 uint32_t BitMask = BitMaskOp->getZExtValue();
11893 // Bits we expect for our StartingIndex
11894 uint32_t IndexMask = 0xFF << (Index * 8);
11895
11896 if ((IndexMask & BitMask) != IndexMask) {
11897 // If the result of the and partially provides the byte, then it
11898 // is not well formatted
11899 if (IndexMask & BitMask)
11900 return std::nullopt;
11902 }
11903
11904 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11905 }
11906
11907 case ISD::FSHR: {
11908 if (IsVec)
11909 return std::nullopt;
11910
11911 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
11912 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11913 if (!ShiftOp || Op.getValueType().isVector())
11914 return std::nullopt;
11915
11916 uint64_t BitsProvided = Op.getValueSizeInBits();
11917 if (BitsProvided % 8 != 0)
11918 return std::nullopt;
11919
11920 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11921 if (BitShift % 8)
11922 return std::nullopt;
11923
11924 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11925 uint64_t ByteShift = BitShift / 8;
11926
11927 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11928 uint64_t BytesProvided = BitsProvided / 8;
11929 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11930 NewIndex %= BytesProvided;
11931 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11932 }
11933
11934 case ISD::SRA:
11935 case ISD::SRL: {
11936 if (IsVec)
11937 return std::nullopt;
11938
11939 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11940 if (!ShiftOp)
11941 return std::nullopt;
11942
11943 uint64_t BitShift = ShiftOp->getZExtValue();
11944 if (BitShift % 8)
11945 return std::nullopt;
11946
11947 auto BitsProvided = Op.getScalarValueSizeInBits();
11948 if (BitsProvided % 8 != 0)
11949 return std::nullopt;
11950
11951 uint64_t BytesProvided = BitsProvided / 8;
11952 uint64_t ByteShift = BitShift / 8;
11953 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
11954 // If the byte we are trying to provide (as tracked by index) falls in this
11955 // range, then the SRL provides the byte. The byte of interest of the src of
11956 // the SRL is Index + ByteShift
11957 return BytesProvided - ByteShift > Index
11958 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11959 Index + ByteShift)
11961 }
11962
11963 case ISD::SHL: {
11964 if (IsVec)
11965 return std::nullopt;
11966
11967 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11968 if (!ShiftOp)
11969 return std::nullopt;
11970
11971 uint64_t BitShift = ShiftOp->getZExtValue();
11972 if (BitShift % 8 != 0)
11973 return std::nullopt;
11974 uint64_t ByteShift = BitShift / 8;
11975
11976 // If we are shifting by an amount greater than (or equal to)
11977 // the index we are trying to provide, then it provides 0s. If not,
11978 // then this bytes are not definitively 0s, and the corresponding byte
11979 // of interest is Index - ByteShift of the src
11980 return Index < ByteShift
11982 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11983 Depth + 1, StartingIndex);
11984 }
11985 case ISD::ANY_EXTEND:
11986 case ISD::SIGN_EXTEND:
11987 case ISD::ZERO_EXTEND:
11989 case ISD::AssertZext:
11990 case ISD::AssertSext: {
11991 if (IsVec)
11992 return std::nullopt;
11993
11994 SDValue NarrowOp = Op->getOperand(0);
11995 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11996 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11997 Op->getOpcode() == ISD::AssertZext ||
11998 Op->getOpcode() == ISD::AssertSext) {
11999 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
12000 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12001 }
12002 if (NarrowBitWidth % 8 != 0)
12003 return std::nullopt;
12004 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12005
12006 if (Index >= NarrowByteWidth)
12007 return Op.getOpcode() == ISD::ZERO_EXTEND
12008 ? std::optional<ByteProvider<SDValue>>(
12010 : std::nullopt;
12011 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12012 }
12013
12014 case ISD::TRUNCATE: {
12015 if (IsVec)
12016 return std::nullopt;
12017
12018 uint64_t NarrowByteWidth = BitWidth / 8;
12019
12020 if (NarrowByteWidth >= Index) {
12021 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12022 StartingIndex);
12023 }
12024
12025 return std::nullopt;
12026 }
12027
12028 case ISD::CopyFromReg: {
12029 if (BitWidth / 8 > Index)
12030 return calculateSrcByte(Op, StartingIndex, Index);
12031
12032 return std::nullopt;
12033 }
12034
12035 case ISD::LOAD: {
12036 auto L = cast<LoadSDNode>(Op.getNode());
12037
12038 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12039 if (NarrowBitWidth % 8 != 0)
12040 return std::nullopt;
12041 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12042
12043 // If the width of the load does not reach byte we are trying to provide for
12044 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
12045 // question
12046 if (Index >= NarrowByteWidth) {
12047 return L->getExtensionType() == ISD::ZEXTLOAD
12048 ? std::optional<ByteProvider<SDValue>>(
12050 : std::nullopt;
12051 }
12052
12053 if (NarrowByteWidth > Index) {
12054 return calculateSrcByte(Op, StartingIndex, Index);
12055 }
12056
12057 return std::nullopt;
12058 }
12059
12060 case ISD::BSWAP: {
12061 if (IsVec)
12062 return std::nullopt;
12063
12064 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12065 Depth + 1, StartingIndex);
12066 }
12067
12069 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12070 if (!IdxOp)
12071 return std::nullopt;
12072 auto VecIdx = IdxOp->getZExtValue();
12073 auto ScalarSize = Op.getScalarValueSizeInBits();
12074 if (ScalarSize < 32)
12075 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12076 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
12077 StartingIndex, Index);
12078 }
12079
12080 case AMDGPUISD::PERM: {
12081 if (IsVec)
12082 return std::nullopt;
12083
12084 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12085 if (!PermMask)
12086 return std::nullopt;
12087
12088 auto IdxMask =
12089 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12090 if (IdxMask > 0x07 && IdxMask != 0x0c)
12091 return std::nullopt;
12092
12093 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12094 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12095
12096 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12099 }
12100
12101 default: {
12102 return std::nullopt;
12103 }
12104 }
12105
12106 llvm_unreachable("fully handled switch");
12107}
12108
12109// Returns true if the Operand is a scalar and is 16 bits
12110static bool isExtendedFrom16Bits(SDValue &Operand) {
12111
12112 switch (Operand.getOpcode()) {
12113 case ISD::ANY_EXTEND:
12114 case ISD::SIGN_EXTEND:
12115 case ISD::ZERO_EXTEND: {
12116 auto OpVT = Operand.getOperand(0).getValueType();
12117 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12118 }
12119 case ISD::LOAD: {
12120 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12121 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12122 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12123 ExtType == ISD::EXTLOAD) {
12124 auto MemVT = L->getMemoryVT();
12125 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12126 }
12127 return L->getMemoryVT().getSizeInBits() == 16;
12128 }
12129 default:
12130 return false;
12131 }
12132}
12133
// Returns true when the two byte selectors in \p Mask pick a pair of bytes
// that are contiguous (low byte first) and start on an even byte offset,
// i.e. the pair maps directly onto an addressable 16-bit operand.
static bool addresses16Bits(int Mask) {
  int LoSel = Mask & 0xff;
  int HiSel = (Mask & 0xff00) >> 8;

  assert(LoSel < 8 && HiSel < 8);
  // The selected bytes must be adjacent in order of increasing address.
  bool Adjacent = (HiSel == LoSel + 1);
  // The pair must also begin on a 16-bit boundary. E.g. two consecutive
  // bytes starting at byte 1 would still require extraction code for the
  // 16-bit operand, so a byte-wise v_perm is preferable in that case.
  bool Aligned16 = (LoSel & 1) == 0;

  return Adjacent && Aligned16;
}
12151
12152// Do not lower into v_perm if the operands are actually 16 bit
12153// and the selected bits (based on PermMask) correspond with two
12154// easily addressable 16 bit operands.
12156 SDValue &OtherOp) {
12157 int Low16 = PermMask & 0xffff;
12158 int Hi16 = (PermMask & 0xffff0000) >> 16;
12159
12160 auto TempOp = peekThroughBitcasts(Op);
12161 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12162
12163 auto OpIs16Bit =
12164 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12165 if (!OpIs16Bit)
12166 return true;
12167
12168 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12169 isExtendedFrom16Bits(TempOtherOp);
12170 if (!OtherOpIs16Bit)
12171 return true;
12172
12173 // Do we cleanly address both
12174 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12175}
12176
12178 unsigned DWordOffset) {
12179 SDValue Ret;
12180
12181 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12182 // ByteProvider must be at least 8 bits
12183 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12184
12185 if (TypeSize <= 32)
12186 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12187
12188 if (Src.getValueType().isVector()) {
12189 auto ScalarTySize = Src.getScalarValueSizeInBits();
12190 auto ScalarTy = Src.getValueType().getScalarType();
12191 if (ScalarTySize == 32) {
12192 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12193 DAG.getConstant(DWordOffset, SL, MVT::i32));
12194 }
12195 if (ScalarTySize > 32) {
12196 Ret = DAG.getNode(
12197 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12198 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12199 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12200 if (ShiftVal)
12201 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12202 DAG.getConstant(ShiftVal, SL, MVT::i32));
12203 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12204 }
12205
12206 assert(ScalarTySize < 32);
12207 auto NumElements = TypeSize / ScalarTySize;
12208 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12209 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12210 auto NumElementsIn32 = 32 / ScalarTySize;
12211 auto NumAvailElements = DWordOffset < Trunc32Elements
12212 ? NumElementsIn32
12213 : NumElements - NormalizedTrunc;
12214
12216 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12217 NumAvailElements);
12218
12219 Ret = DAG.getBuildVector(
12220 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12221 VecSrcs);
12222 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12223 }
12224
12225 /// Scalar Type
12226 auto ShiftVal = 32 * DWordOffset;
12227 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12228 DAG.getConstant(ShiftVal, SL, MVT::i32));
12229 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12230}
12231
12233 SelectionDAG &DAG = DCI.DAG;
12234 [[maybe_unused]] EVT VT = N->getValueType(0);
12236
12237 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12238 assert(VT == MVT::i32);
12239 for (int i = 0; i < 4; i++) {
12240 // Find the ByteProvider that provides the ith byte of the result of OR
12241 std::optional<ByteProvider<SDValue>> P =
12242 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12243 // TODO support constantZero
12244 if (!P || P->isConstantZero())
12245 return SDValue();
12246
12247 PermNodes.push_back(*P);
12248 }
12249 if (PermNodes.size() != 4)
12250 return SDValue();
12251
12252 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12253 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12254 uint64_t PermMask = 0x00000000;
12255 for (size_t i = 0; i < PermNodes.size(); i++) {
12256 auto PermOp = PermNodes[i];
12257 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12258 // by sizeof(Src2) = 4
12259 int SrcByteAdjust = 4;
12260
12261 // If the Src uses a byte from a different DWORD, then it corresponds
12262 // with a difference source
12263 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12264 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12265 if (SecondSrc)
12266 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12267 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12268 return SDValue();
12269
12270 // Set the index of the second distinct Src node
12271 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12272 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12273 SrcByteAdjust = 0;
12274 }
12275 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12277 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12278 }
12279 SDLoc DL(N);
12280 SDValue Op = *PermNodes[FirstSrc.first].Src;
12281 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12282 assert(Op.getValueSizeInBits() == 32);
12283
12284 // Check that we are not just extracting the bytes in order from an op
12285 if (!SecondSrc) {
12286 int Low16 = PermMask & 0xffff;
12287 int Hi16 = (PermMask & 0xffff0000) >> 16;
12288
12289 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12290 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12291
12292 // The perm op would really just produce Op. So combine into Op
12293 if (WellFormedLow && WellFormedHi)
12294 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12295 }
12296
12297 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12298
12299 if (SecondSrc) {
12300 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12301 assert(OtherOp.getValueSizeInBits() == 32);
12302 }
12303
12304 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12305
12306 assert(Op.getValueType().isByteSized() &&
12307 OtherOp.getValueType().isByteSized());
12308
12309 // If the ultimate src is less than 32 bits, then we will only be
12310 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12311 // CalculateByteProvider would not have returned Op as source if we
12312 // used a byte that is outside its ValueType. Thus, we are free to
12313 // ANY_EXTEND as the extended bits are dont-cares.
12314 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12315 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12316
12317 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12318 DAG.getConstant(PermMask, DL, MVT::i32));
12319 }
12320 return SDValue();
12321}
12322
12323SDValue SITargetLowering::performOrCombine(SDNode *N,
12324 DAGCombinerInfo &DCI) const {
12325 SelectionDAG &DAG = DCI.DAG;
12326 SDValue LHS = N->getOperand(0);
12327 SDValue RHS = N->getOperand(1);
12328
12329 EVT VT = N->getValueType(0);
12330 if (VT == MVT::i1) {
12331 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12332 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12333 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12334 SDValue Src = LHS.getOperand(0);
12335 if (Src != RHS.getOperand(0))
12336 return SDValue();
12337
12338 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12339 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12340 if (!CLHS || !CRHS)
12341 return SDValue();
12342
12343 // Only 10 bits are used.
12344 static const uint32_t MaxMask = 0x3ff;
12345
12346 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12347 SDLoc DL(N);
12348 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12349 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12350 }
12351
12352 return SDValue();
12353 }
12354
12355 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12356 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12357 LHS.getOpcode() == AMDGPUISD::PERM &&
12358 isa<ConstantSDNode>(LHS.getOperand(2))) {
12359 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12360 if (!Sel)
12361 return SDValue();
12362
12363 Sel |= LHS.getConstantOperandVal(2);
12364 SDLoc DL(N);
12365 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12366 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12367 }
12368
12369 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12371 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12372 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12373
12374 // If all the uses of an or need to extract the individual elements, do not
12375 // attempt to lower into v_perm
12376 auto usesCombinedOperand = [](SDNode *OrUse) {
12377 // If we have any non-vectorized use, then it is a candidate for v_perm
12378 if (OrUse->getOpcode() != ISD::BITCAST ||
12379 !OrUse->getValueType(0).isVector())
12380 return true;
12381
12382 // If we have any non-vectorized use, then it is a candidate for v_perm
12383 for (auto VUse : OrUse->uses()) {
12384 if (!VUse->getValueType(0).isVector())
12385 return true;
12386
12387 // If the use of a vector is a store, then combining via a v_perm
12388 // is beneficial.
12389 // TODO -- whitelist more uses
12390 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12391 if (VUse->getOpcode() == VectorwiseOp)
12392 return true;
12393 }
12394 return false;
12395 };
12396
12397 if (!any_of(N->uses(), usesCombinedOperand))
12398 return SDValue();
12399
12400 uint32_t LHSMask = getPermuteMask(LHS);
12401 uint32_t RHSMask = getPermuteMask(RHS);
12402
12403 if (LHSMask != ~0u && RHSMask != ~0u) {
12404 // Canonicalize the expression in an attempt to have fewer unique masks
12405 // and therefore fewer registers used to hold the masks.
12406 if (LHSMask > RHSMask) {
12407 std::swap(LHSMask, RHSMask);
12408 std::swap(LHS, RHS);
12409 }
12410
12411 // Select 0xc for each lane used from source operand. Zero has 0xc mask
12412 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
12413 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12414 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12415
12416 // Check of we need to combine values from two sources within a byte.
12417 if (!(LHSUsedLanes & RHSUsedLanes) &&
12418 // If we select high and lower word keep it for SDWA.
12419 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12420 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12421 // Kill zero bytes selected by other mask. Zero value is 0xc.
12422 LHSMask &= ~RHSUsedLanes;
12423 RHSMask &= ~LHSUsedLanes;
12424 // Add 4 to each active LHS lane
12425 LHSMask |= LHSUsedLanes & 0x04040404;
12426 // Combine masks
12427 uint32_t Sel = LHSMask | RHSMask;
12428 SDLoc DL(N);
12429
12430 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12431 LHS.getOperand(0), RHS.getOperand(0),
12432 DAG.getConstant(Sel, DL, MVT::i32));
12433 }
12434 }
12435 if (LHSMask == ~0u || RHSMask == ~0u) {
12436 if (SDValue Perm = matchPERM(N, DCI))
12437 return Perm;
12438 }
12439 }
12440
12441 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12442 return SDValue();
12443
12444 // TODO: This could be a generic combine with a predicate for extracting the
12445 // high half of an integer being free.
12446
12447 // (or i64:x, (zero_extend i32:y)) ->
12448 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
12449 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12450 RHS.getOpcode() != ISD::ZERO_EXTEND)
12451 std::swap(LHS, RHS);
12452
12453 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12454 SDValue ExtSrc = RHS.getOperand(0);
12455 EVT SrcVT = ExtSrc.getValueType();
12456 if (SrcVT == MVT::i32) {
12457 SDLoc SL(N);
12458 SDValue LowLHS, HiBits;
12459 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12460 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12461
12462 DCI.AddToWorklist(LowOr.getNode());
12463 DCI.AddToWorklist(HiBits.getNode());
12464
12465 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12466 LowOr, HiBits);
12467 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12468 }
12469 }
12470
12471 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12472 if (CRHS) {
12473 if (SDValue Split
12474 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12475 N->getOperand(0), CRHS))
12476 return Split;
12477 }
12478
12479 return SDValue();
12480}
12481
12482SDValue SITargetLowering::performXorCombine(SDNode *N,
12483 DAGCombinerInfo &DCI) const {
12484 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12485 return RV;
12486
12487 SDValue LHS = N->getOperand(0);
12488 SDValue RHS = N->getOperand(1);
12489
12490 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12491 SelectionDAG &DAG = DCI.DAG;
12492
12493 EVT VT = N->getValueType(0);
12494 if (CRHS && VT == MVT::i64) {
12495 if (SDValue Split
12496 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12497 return Split;
12498 }
12499
12500 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12501 // fneg-like xors into 64-bit select.
12502 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12503 // This looks like an fneg, try to fold as a source modifier.
12504 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12505 shouldFoldFNegIntoSrc(N, LHS)) {
12506 // xor (select c, a, b), 0x80000000 ->
12507 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
12508 SDLoc DL(N);
12509 SDValue CastLHS =
12510 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12511 SDValue CastRHS =
12512 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12513 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12514 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12515 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12516 LHS->getOperand(0), FNegLHS, FNegRHS);
12517 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12518 }
12519 }
12520
12521 return SDValue();
12522}
12523
12524SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12525 DAGCombinerInfo &DCI) const {
12526 if (!Subtarget->has16BitInsts() ||
12527 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12528 return SDValue();
12529
12530 EVT VT = N->getValueType(0);
12531 if (VT != MVT::i32)
12532 return SDValue();
12533
12534 SDValue Src = N->getOperand(0);
12535 if (Src.getValueType() != MVT::i16)
12536 return SDValue();
12537
12538 return SDValue();
12539}
12540
12541SDValue
12542SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12543 DAGCombinerInfo &DCI) const {
12544 SDValue Src = N->getOperand(0);
12545 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12546
12547 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12548 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12549 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12550 VTSign->getVT() == MVT::i8) ||
12551 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12552 VTSign->getVT() == MVT::i16))) {
12553 assert(Subtarget->hasScalarSubwordLoads() &&
12554 "s_buffer_load_{u8, i8} are supported "
12555 "in GFX12 (or newer) architectures.");
12556 EVT VT = Src.getValueType();
12557 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12560 SDLoc DL(N);
12561 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12562 SDValue Ops[] = {
12563 Src.getOperand(0), // source register
12564 Src.getOperand(1), // offset
12565 Src.getOperand(2) // cachePolicy
12566 };
12567 auto *M = cast<MemSDNode>(Src);
12568 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12569 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12570 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12571 return LoadVal;
12572 }
12573 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12574 VTSign->getVT() == MVT::i8) ||
12575 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12576 VTSign->getVT() == MVT::i16)) &&
12577 Src.hasOneUse()) {
12578 auto *M = cast<MemSDNode>(Src);
12579 SDValue Ops[] = {
12580 Src.getOperand(0), // Chain
12581 Src.getOperand(1), // rsrc
12582 Src.getOperand(2), // vindex
12583 Src.getOperand(3), // voffset
12584 Src.getOperand(4), // soffset
12585 Src.getOperand(5), // offset
12586 Src.getOperand(6),
12587 Src.getOperand(7)
12588 };
12589 // replace with BUFFER_LOAD_BYTE/SHORT
12590 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12591 Src.getOperand(0).getValueType());
12592 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12594 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12595 ResList,
12596 Ops, M->getMemoryVT(),
12597 M->getMemOperand());
12598 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12599 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12600 }
12601 return SDValue();
12602}
12603
12604SDValue SITargetLowering::performClassCombine(SDNode *N,
12605 DAGCombinerInfo &DCI) const {
12606 SelectionDAG &DAG = DCI.DAG;
12607 SDValue Mask = N->getOperand(1);
12608
12609 // fp_class x, 0 -> false
12610 if (isNullConstant(Mask))
12611 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12612
12613 if (N->getOperand(0).isUndef())
12614 return DAG.getUNDEF(MVT::i1);
12615
12616 return SDValue();
12617}
12618
12619SDValue SITargetLowering::performRcpCombine(SDNode *N,
12620 DAGCombinerInfo &DCI) const {
12621 EVT VT = N->getValueType(0);
12622 SDValue N0 = N->getOperand(0);
12623
12624 if (N0.isUndef()) {
12625 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
12626 SDLoc(N), VT);
12627 }
12628
12629 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12630 N0.getOpcode() == ISD::SINT_TO_FP)) {
12631 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12632 N->getFlags());
12633 }
12634
12635 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12636 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12637 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12638 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12639 N0.getOperand(0), N->getFlags());
12640 }
12641
12643}
12644
12646 unsigned MaxDepth) const {
12647 unsigned Opcode = Op.getOpcode();
12648 if (Opcode == ISD::FCANONICALIZE)
12649 return true;
12650
12651 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12652 const auto &F = CFP->getValueAPF();
12653 if (F.isNaN() && F.isSignaling())
12654 return false;
12655 if (!F.isDenormal())
12656 return true;
12657
12658 DenormalMode Mode =
12659 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12660 return Mode == DenormalMode::getIEEE();
12661 }
12662
12663 // If source is a result of another standard FP operation it is already in
12664 // canonical form.
12665 if (MaxDepth == 0)
12666 return false;
12667
12668 switch (Opcode) {
12669 // These will flush denorms if required.
12670 case ISD::FADD:
12671 case ISD::FSUB:
12672 case ISD::FMUL:
12673 case ISD::FCEIL:
12674 case ISD::FFLOOR:
12675 case ISD::FMA:
12676 case ISD::FMAD:
12677 case ISD::FSQRT:
12678 case ISD::FDIV:
12679 case ISD::FREM:
12680 case ISD::FP_ROUND:
12681 case ISD::FP_EXTEND:
12682 case ISD::FP16_TO_FP:
12683 case ISD::FP_TO_FP16:
12684 case ISD::BF16_TO_FP:
12685 case ISD::FP_TO_BF16:
12686 case ISD::FLDEXP:
12689 case AMDGPUISD::RCP:
12690 case AMDGPUISD::RSQ:
12694 case AMDGPUISD::LOG:
12695 case AMDGPUISD::EXP:
12699 case AMDGPUISD::FRACT:
12706 case AMDGPUISD::SIN_HW:
12707 case AMDGPUISD::COS_HW:
12708 return true;
12709
12710 // It can/will be lowered or combined as a bit operation.
12711 // Need to check their input recursively to handle.
12712 case ISD::FNEG:
12713 case ISD::FABS:
12714 case ISD::FCOPYSIGN:
12715 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12716
12717 case ISD::AND:
12718 if (Op.getValueType() == MVT::i32) {
12719 // Be careful as we only know it is a bitcast floating point type. It
12720 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12721 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12722 // is valid to optimize for all types.
12723 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12724 if (RHS->getZExtValue() == 0xffff0000) {
12725 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12726 }
12727 }
12728 }
12729 break;
12730
12731 case ISD::FSIN:
12732 case ISD::FCOS:
12733 case ISD::FSINCOS:
12734 return Op.getValueType().getScalarType() != MVT::f16;
12735
12736 case ISD::FMINNUM:
12737 case ISD::FMAXNUM:
12738 case ISD::FMINNUM_IEEE:
12739 case ISD::FMAXNUM_IEEE:
12740 case ISD::FMINIMUM:
12741 case ISD::FMAXIMUM:
12742 case AMDGPUISD::CLAMP:
12743 case AMDGPUISD::FMED3:
12744 case AMDGPUISD::FMAX3:
12745 case AMDGPUISD::FMIN3:
12747 case AMDGPUISD::FMINIMUM3: {
12748 // FIXME: Shouldn't treat the generic operations different based these.
12749 // However, we aren't really required to flush the result from
12750 // minnum/maxnum..
12751
12752 // snans will be quieted, so we only need to worry about denormals.
12753 if (Subtarget->supportsMinMaxDenormModes() ||
12754 // FIXME: denormalsEnabledForType is broken for dynamic
12755 denormalsEnabledForType(DAG, Op.getValueType()))
12756 return true;
12757
12758 // Flushing may be required.
12759 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12760 // targets need to check their input recursively.
12761
12762 // FIXME: Does this apply with clamp? It's implemented with max.
12763 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12764 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12765 return false;
12766 }
12767
12768 return true;
12769 }
12770 case ISD::SELECT: {
12771 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12772 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12773 }
12774 case ISD::BUILD_VECTOR: {
12775 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12776 SDValue SrcOp = Op.getOperand(i);
12777 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12778 return false;
12779 }
12780
12781 return true;
12782 }
12785 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12786 }
12788 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12789 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12790 }
12791 case ISD::UNDEF:
12792 // Could be anything.
12793 return false;
12794
12795 case ISD::BITCAST:
12796 // TODO: This is incorrect as it loses track of the operand's type. We may
12797 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12798 // same bits that are canonicalized in one type need not be in the other.
12799 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12800 case ISD::TRUNCATE: {
12801 // Hack round the mess we make when legalizing extract_vector_elt
12802 if (Op.getValueType() == MVT::i16) {
12803 SDValue TruncSrc = Op.getOperand(0);
12804 if (TruncSrc.getValueType() == MVT::i32 &&
12805 TruncSrc.getOpcode() == ISD::BITCAST &&
12806 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12807 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12808 }
12809 }
12810 return false;
12811 }
12813 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12814 // TODO: Handle more intrinsics
12815 switch (IntrinsicID) {
12816 case Intrinsic::amdgcn_cvt_pkrtz:
12817 case Intrinsic::amdgcn_cubeid:
12818 case Intrinsic::amdgcn_frexp_mant:
12819 case Intrinsic::amdgcn_fdot2:
12820 case Intrinsic::amdgcn_rcp:
12821 case Intrinsic::amdgcn_rsq:
12822 case Intrinsic::amdgcn_rsq_clamp:
12823 case Intrinsic::amdgcn_rcp_legacy:
12824 case Intrinsic::amdgcn_rsq_legacy:
12825 case Intrinsic::amdgcn_trig_preop:
12826 case Intrinsic::amdgcn_log:
12827 case Intrinsic::amdgcn_exp2:
12828 case Intrinsic::amdgcn_sqrt:
12829 return true;
12830 default:
12831 break;
12832 }
12833
12834 break;
12835 }
12836 default:
12837 break;
12838 }
12839
12840 // FIXME: denormalsEnabledForType is broken for dynamic
12841 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12842 DAG.isKnownNeverSNaN(Op);
12843}
12844
12846 unsigned MaxDepth) const {
12847 const MachineRegisterInfo &MRI = MF.getRegInfo();
12848 MachineInstr *MI = MRI.getVRegDef(Reg);
12849 unsigned Opcode = MI->getOpcode();
12850
12851 if (Opcode == AMDGPU::G_FCANONICALIZE)
12852 return true;
12853
12854 std::optional<FPValueAndVReg> FCR;
12855 // Constant splat (can be padded with undef) or scalar constant.
12856 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12857 if (FCR->Value.isSignaling())
12858 return false;
12859 if (!FCR->Value.isDenormal())
12860 return true;
12861
12862 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12863 return Mode == DenormalMode::getIEEE();
12864 }
12865
12866 if (MaxDepth == 0)
12867 return false;
12868
12869 switch (Opcode) {
12870 case AMDGPU::G_FADD:
12871 case AMDGPU::G_FSUB:
12872 case AMDGPU::G_FMUL:
12873 case AMDGPU::G_FCEIL:
12874 case AMDGPU::G_FFLOOR:
12875 case AMDGPU::G_FRINT:
12876 case AMDGPU::G_FNEARBYINT:
12877 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12878 case AMDGPU::G_INTRINSIC_TRUNC:
12879 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12880 case AMDGPU::G_FMA:
12881 case AMDGPU::G_FMAD:
12882 case AMDGPU::G_FSQRT:
12883 case AMDGPU::G_FDIV:
12884 case AMDGPU::G_FREM:
12885 case AMDGPU::G_FPOW:
12886 case AMDGPU::G_FPEXT:
12887 case AMDGPU::G_FLOG:
12888 case AMDGPU::G_FLOG2:
12889 case AMDGPU::G_FLOG10:
12890 case AMDGPU::G_FPTRUNC:
12891 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12892 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12893 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12894 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12895 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12896 return true;
12897 case AMDGPU::G_FNEG:
12898 case AMDGPU::G_FABS:
12899 case AMDGPU::G_FCOPYSIGN:
12900 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12901 case AMDGPU::G_FMINNUM:
12902 case AMDGPU::G_FMAXNUM:
12903 case AMDGPU::G_FMINNUM_IEEE:
12904 case AMDGPU::G_FMAXNUM_IEEE:
12905 case AMDGPU::G_FMINIMUM:
12906 case AMDGPU::G_FMAXIMUM: {
12907 if (Subtarget->supportsMinMaxDenormModes() ||
12908 // FIXME: denormalsEnabledForType is broken for dynamic
12909 denormalsEnabledForType(MRI.getType(Reg), MF))
12910 return true;
12911
12912 [[fallthrough]];
12913 }
12914 case AMDGPU::G_BUILD_VECTOR:
12915 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12916 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12917 return false;
12918 return true;
12919 case AMDGPU::G_INTRINSIC:
12920 case AMDGPU::G_INTRINSIC_CONVERGENT:
12921 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12922 case Intrinsic::amdgcn_fmul_legacy:
12923 case Intrinsic::amdgcn_fmad_ftz:
12924 case Intrinsic::amdgcn_sqrt:
12925 case Intrinsic::amdgcn_fmed3:
12926 case Intrinsic::amdgcn_sin:
12927 case Intrinsic::amdgcn_cos:
12928 case Intrinsic::amdgcn_log:
12929 case Intrinsic::amdgcn_exp2:
12930 case Intrinsic::amdgcn_log_clamp:
12931 case Intrinsic::amdgcn_rcp:
12932 case Intrinsic::amdgcn_rcp_legacy:
12933 case Intrinsic::amdgcn_rsq:
12934 case Intrinsic::amdgcn_rsq_clamp:
12935 case Intrinsic::amdgcn_rsq_legacy:
12936 case Intrinsic::amdgcn_div_scale:
12937 case Intrinsic::amdgcn_div_fmas:
12938 case Intrinsic::amdgcn_div_fixup:
12939 case Intrinsic::amdgcn_fract:
12940 case Intrinsic::amdgcn_cvt_pkrtz:
12941 case Intrinsic::amdgcn_cubeid:
12942 case Intrinsic::amdgcn_cubema:
12943 case Intrinsic::amdgcn_cubesc:
12944 case Intrinsic::amdgcn_cubetc:
12945 case Intrinsic::amdgcn_frexp_mant:
12946 case Intrinsic::amdgcn_fdot2:
12947 case Intrinsic::amdgcn_trig_preop:
12948 return true;
12949 default:
12950 break;
12951 }
12952
12953 [[fallthrough]];
12954 default:
12955 return false;
12956 }
12957
12958 llvm_unreachable("invalid operation");
12959}
12960
12961// Constant fold canonicalize.
12962SDValue SITargetLowering::getCanonicalConstantFP(
12963 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12964 // Flush denormals to 0 if not enabled.
12965 if (C.isDenormal()) {
12966 DenormalMode Mode =
12967 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12968 if (Mode == DenormalMode::getPreserveSign()) {
12969 return DAG.getConstantFP(
12970 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12971 }
12972
12973 if (Mode != DenormalMode::getIEEE())
12974 return SDValue();
12975 }
12976
12977 if (C.isNaN()) {
12978 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12979 if (C.isSignaling()) {
12980 // Quiet a signaling NaN.
12981 // FIXME: Is this supposed to preserve payload bits?
12982 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12983 }
12984
12985 // Make sure it is the canonical NaN bitpattern.
12986 //
12987 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12988 // immediate?
12989 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12990 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12991 }
12992
12993 // Already canonical.
12994 return DAG.getConstantFP(C, SL, VT);
12995}
12996
12998 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
12999}
13000
// Combine for ISD::FCANONICALIZE: fold canonicalization of undef and
// constants, and scalarize canonicalization of v2f16 build_vectors.
SDValue SITargetLowering::performFCanonicalizeCombine(
  SDNode *N,
  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fcanonicalize undef -> qnan
  if (N0.isUndef()) {
    return DAG.getConstantFP(QNaN, SDLoc(N), VT);
  }

  // Scalar or splat FP constant: fold directly.
  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
  }

  // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
  // (fcanonicalize k)
  //
  // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0

  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and to consider uses since there aren't that many packed operations.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
      isTypeLegal(MVT::v2f16)) {
    SDLoc SL(N);
    SDValue NewElts[2];
    SDValue Lo = N0.getOperand(0);
    SDValue Hi = N0.getOperand(1);
    EVT EltVT = Lo.getValueType();

    // Canonicalize each element: constants fold, undefs are resolved below
    // based on the other element, everything else gets a scalar
    // fcanonicalize.
    for (unsigned I = 0; I != 2; ++I) {
      SDValue Op = N0.getOperand(I);
        NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                            CFP->getValueAPF());
      } else if (Op.isUndef()) {
        // Handled below based on what the other operand is.
        NewElts[I] = Op;
      } else {
        NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
      }
    }

    // If one half is undef, and one is constant, prefer a splat vector rather
    // than the normal qNaN. If it's a register, prefer 0.0 since that's
    // cheaper to use and may be free with a packed operation.
    // NOTE(review): the guarding isa<> check here makes the 0.0 fallback arm
    // unreachable, unlike the symmetric NewElts[1] case below — so an undef
    // lane paired with a non-constant lane stays undef; verify against the
    // stated intent.
    if (NewElts[0].isUndef()) {
      if (isa<ConstantFPSDNode>(NewElts[1]))
        NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
          NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
    }

    if (NewElts[1].isUndef()) {
      NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
        NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
    }

    return DAG.getBuildVector(VT, SL, NewElts);
  }
  }

  return SDValue();
}
13068
13069static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13070 switch (Opc) {
13071 case ISD::FMAXNUM:
13072 case ISD::FMAXNUM_IEEE:
13073 return AMDGPUISD::FMAX3;
13074 case ISD::FMAXIMUM:
13075 return AMDGPUISD::FMAXIMUM3;
13076 case ISD::SMAX:
13077 return AMDGPUISD::SMAX3;
13078 case ISD::UMAX:
13079 return AMDGPUISD::UMAX3;
13080 case ISD::FMINNUM:
13081 case ISD::FMINNUM_IEEE:
13082 return AMDGPUISD::FMIN3;
13083 case ISD::FMINIMUM:
13084 return AMDGPUISD::FMINIMUM3;
13085 case ISD::SMIN:
13086 return AMDGPUISD::SMIN3;
13087 case ISD::UMIN:
13088 return AMDGPUISD::UMIN3;
13089 default:
13090 llvm_unreachable("Not a min/max opcode");
13091 }
13092}
13093
// Fold an integer min/max clamp pair into [su]med3 when both bounds are
// constants. \p Src is the non-constant operand; \p MinVal / \p MaxVal are
// the rhs operands of the min and max respectively. Returns the med3 node,
// or SDValue() when the pattern or profitability checks fail.
SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
                                                   const SDLoc &SL, SDValue Src,
                                                   SDValue MinVal,
                                                   SDValue MaxVal,
                                                   bool Signed) const {

  // med3 comes from
  // min(max(x, K0), K1), K0 < K1
  // max(min(x, K0), K1), K1 < K0
  //
  // "MinVal" and "MaxVal" respectively refer to the rhs of the
  // min/max op.

  if (!MinK || !MaxK)
    return SDValue();

  // The bounds must be strictly ordered in the appropriate signedness,
  // otherwise this is not a valid med3 pattern.
  if (Signed) {
    if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
      return SDValue();
  } else {
    if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
      return SDValue();
  }

  // med3 exists for i32 generally, and for i16 only on subtargets with the
  // 16-bit med3 instructions.
  EVT VT = MinK->getValueType(0);
  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);

  // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
  // not available, but this is unlikely to be profitable as constants
  // will often need to be materialized & extended, especially on
  // pre-GFX10 where VOP3 instructions couldn't take literal operands.
  return SDValue();
}
13131
13134 return C;
13135
13137 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13138 return C;
13139 }
13140
13141 return nullptr;
13142}
13143
// Try to fold an FP clamp pattern min(max(x, K0), K1) (\p Op0 is the inner
// max, \p Op1 the outer min's rhs) into AMDGPUISD::CLAMP or fmed3.
SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
                                                  const SDLoc &SL,
                                                  SDValue Op0,
                                                  SDValue Op1) const {
  if (!K1)
    return SDValue();

  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  if (K0->getValueAPF() > K1->getValueAPF())
    return SDValue();

  const MachineFunction &MF = DAG.getMachineFunction();

  // TODO: Check IEEE bit enabled?
  EVT VT = Op0.getValueType();
  if (Info->getMode().DX10Clamp) {
    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
    // hardware fmed3 behavior converting to a min.
    // FIXME: Should this be allowing -0.0?
    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
  }

  // med3 for f16 is only available on gfx9+, and not available for v2f16.
  if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
    // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
    // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
    // then give the other result, which is different from med3 with a NaN
    // input.
    SDValue Var = Op0.getOperand(0);
    if (!DAG.isKnownNeverSNaN(Var))
      return SDValue();


    // Only form the fmed3 if each constant either has other uses anyway or
    // can be encoded as an inline immediate, so we don't pay to keep it
    // materialized just for this.
    if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
        (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
      return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                         Var, SDValue(K0, 0), SDValue(K1, 0));
    }
  }

  return SDValue();
}
13194
/// \return true if the subtarget supports minimum3 and maximum3 with the given
/// base min/max opcode \p Opc for type \p VT.
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
                             EVT VT) {
  switch (Opc) {
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case ISD::FMINNUM_IEEE:
  case ISD::FMAXNUM_IEEE:
    // f32 min3/max3 are always available; f16 needs the 16-bit variants.
    return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16());
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
    // IEEE-semantics minimum/maximum only have 3-operand forms on subtargets
    // with the IEEE min3/max3 instructions.
    return (VT == MVT::f32 || VT == MVT::f16) && Subtarget.hasIEEEMinMax3();
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
    return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
  default:
    return false;
  }

  // Not reached: every switch case above returns.
  llvm_unreachable("not a min/max opcode");
}
13221
13222SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13223 DAGCombinerInfo &DCI) const {
13224 SelectionDAG &DAG = DCI.DAG;
13225
13226 EVT VT = N->getValueType(0);
13227 unsigned Opc = N->getOpcode();
13228 SDValue Op0 = N->getOperand(0);
13229 SDValue Op1 = N->getOperand(1);
13230
13231 // Only do this if the inner op has one use since this will just increases
13232 // register pressure for no benefit.
13233
13234 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
13235 // max(max(a, b), c) -> max3(a, b, c)
13236 // min(min(a, b), c) -> min3(a, b, c)
13237 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13238 SDLoc DL(N);
13239 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13240 DL,
13241 N->getValueType(0),
13242 Op0.getOperand(0),
13243 Op0.getOperand(1),
13244 Op1);
13245 }
13246
13247 // Try commuted.
13248 // max(a, max(b, c)) -> max3(a, b, c)
13249 // min(a, min(b, c)) -> min3(a, b, c)
13250 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13251 SDLoc DL(N);
13252 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13253 DL,
13254 N->getValueType(0),
13255 Op0,
13256 Op1.getOperand(0),
13257 Op1.getOperand(1));
13258 }
13259 }
13260
13261 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13262 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13263 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13264 if (SDValue Med3 = performIntMed3ImmCombine(
13265 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13266 return Med3;
13267 }
13268 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13269 if (SDValue Med3 = performIntMed3ImmCombine(
13270 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13271 return Med3;
13272 }
13273
13274 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13275 if (SDValue Med3 = performIntMed3ImmCombine(
13276 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13277 return Med3;
13278 }
13279 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13280 if (SDValue Med3 = performIntMed3ImmCombine(
13281 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13282 return Med3;
13283 }
13284
13285 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13286 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13287 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13288 (Opc == AMDGPUISD::FMIN_LEGACY &&
13289 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13290 (VT == MVT::f32 || VT == MVT::f64 ||
13291 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13292 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13293 Op0.hasOneUse()) {
13294 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13295 return Res;
13296 }
13297
13298 return SDValue();
13299}
13300
13304 // FIXME: Should this be allowing -0.0?
13305 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13306 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13307 }
13308 }
13309
13310 return false;
13311}
13312
// FIXME: Should only worry about snans for version with chain.
// Combine for AMDGPUISD::FMED3: recognize fmed3 with constant bounds {0.0,
// 1.0} and turn it into AMDGPUISD::CLAMP.
SDValue SITargetLowering::performFMed3Combine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
  // NaNs. With a NaN input, the order of the operands may change the result.

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);

  if (isClampZeroToOne(Src0, Src1)) {
    // const_a, const_b, x -> clamp is safe in all cases including signaling
    // nans.
    // FIXME: Should this be allowing -0.0?
    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
  }

  const MachineFunction &MF = DAG.getMachineFunction();

  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
  // handling no dx10-clamp?
  if (Info->getMode().DX10Clamp) {
    // If NaNs is clamped to 0, we are free to reorder the inputs.

    // Bubble any constant operands toward Src1/Src2 so a {0.0, 1.0} pair, if
    // present, ends up in the positions checked below.
    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
      std::swap(Src1, Src2);

    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
      std::swap(Src0, Src1);

    if (isClampZeroToOne(Src1, Src2))
      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
  }

  return SDValue();
}
13357
13358SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13359 DAGCombinerInfo &DCI) const {
13360 SDValue Src0 = N->getOperand(0);
13361 SDValue Src1 = N->getOperand(1);
13362 if (Src0.isUndef() && Src1.isUndef())
13363 return DCI.DAG.getUNDEF(N->getValueType(0));
13364 return SDValue();
13365}
13366
13367// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13368// expanded into a set of cmp/select instructions.
13370 unsigned NumElem,
13371 bool IsDivergentIdx,
13372 const GCNSubtarget *Subtarget) {
13374 return false;
13375
13376 unsigned VecSize = EltSize * NumElem;
13377
13378 // Sub-dword vectors of size 2 dword or less have better implementation.
13379 if (VecSize <= 64 && EltSize < 32)
13380 return false;
13381
13382 // Always expand the rest of sub-dword instructions, otherwise it will be
13383 // lowered via memory.
13384 if (EltSize < 32)
13385 return true;
13386
13387 // Always do this if var-idx is divergent, otherwise it will become a loop.
13388 if (IsDivergentIdx)
13389 return true;
13390
13391 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13392 unsigned NumInsts = NumElem /* Number of compares */ +
13393 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
13394
13395 // On some architectures (GFX9) movrel is not available and it's better
13396 // to expand.
13397 if (Subtarget->useVGPRIndexMode())
13398 return NumInsts <= 16;
13399
13400 // If movrel is available, use it instead of expanding for vector of 8
13401 // elements.
13402 if (Subtarget->hasMovrel())
13403 return NumInsts <= 15;
13404
13405 return true;
13406}
13407
13409 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13410 if (isa<ConstantSDNode>(Idx))
13411 return false;
13412
13413 SDValue Vec = N->getOperand(0);
13414 EVT VecVT = Vec.getValueType();
13415 EVT EltVT = VecVT.getVectorElementType();
13416 unsigned EltSize = EltVT.getSizeInBits();
13417 unsigned NumElem = VecVT.getVectorNumElements();
13418
13420 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13421}
13422
// Combine for ISD::EXTRACT_VECTOR_ELT: pull source modifiers and binary ops
// through the extract, expand dynamic-index extracts into selects, and narrow
// sub-dword extracts of loaded vectors into 32-bit extracts.
SDValue SITargetLowering::performExtractVectorEltCombine(
  SDNode *N, DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;

  EVT VecVT = Vec.getValueType();
  EVT VecEltVT = VecVT.getVectorElementType();
  EVT ResVT = N->getValueType(0);

  unsigned VecSize = VecVT.getSizeInBits();
  unsigned VecEltSize = VecEltVT.getSizeInBits();

  // Move fneg/fabs below the extract when every use can absorb it as a
  // source modifier: extract(fneg v, i) -> fneg (extract v, i).
  if ((Vec.getOpcode() == ISD::FNEG ||
       Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue Elt =
        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
    return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
  }

  // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
  // =>
  // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
  // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
  // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
  if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    unsigned Opc = Vec.getOpcode();

    switch(Opc) {
    default:
      break;
    // TODO: Support other binary operations.
    case ISD::FADD:
    case ISD::FSUB:
    case ISD::FMUL:
    case ISD::ADD:
    case ISD::UMIN:
    case ISD::UMAX:
    case ISD::SMIN:
    case ISD::SMAX:
    case ISD::FMAXNUM:
    case ISD::FMINNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM: {
      SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(0), Idx);
      SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
                                 Vec.getOperand(1), Idx);

      DCI.AddToWorklist(Elt0.getNode());
      DCI.AddToWorklist(Elt1.getNode());
      // Propagate the original node's flags (e.g. fast-math) to the scalar op.
      return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
    }
    }
  }

  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
    SDLoc SL(N);
    SDValue Idx = N->getOperand(1);
    SDValue V;
    // Build a compare/select chain over all constant indices; the last
    // matching index wins, starting from element 0 as the default.
    for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
      SDValue IC = DAG.getVectorIdxConstant(I, SL);
      SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
      if (I == 0)
        V = Elt;
      else
        V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
    }
    return V;
  }

  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
  // elements. This exposes more load reduction opportunities by replacing
  // multiple small extract_vector_elements with a single 32-bit extract.
  auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
      VecSize > 32 && VecSize % 32 == 0 && Idx) {
    EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);

    // Locate the dword holding the element, and the bit offset within it.
    unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
    unsigned EltIdx = BitIndex / 32;
    unsigned LeftoverBitIdx = BitIndex % 32;
    SDLoc SL(N);

    SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
    DCI.AddToWorklist(Cast.getNode());

    // Extract the containing dword, shift the element down to bit 0, then
    // truncate to the element's integer type.
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
                              DAG.getConstant(EltIdx, SL, MVT::i32));
    DCI.AddToWorklist(Elt.getNode());
    SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
                              DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
    DCI.AddToWorklist(Srl.getNode());

    EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
    DCI.AddToWorklist(Trunc.getNode());

    if (VecEltVT == ResVT) {
      return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
    }

    assert(ResVT.isScalarInteger());
    return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
  }

  return SDValue();
}
13540
// Combine for ISD::INSERT_VECTOR_ELT with a variable index: expand it into a
// build_vector of per-element compare/selects against the index.
SDValue
SITargetLowering::performInsertVectorEltCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  SDValue Vec = N->getOperand(0);
  SDValue Idx = N->getOperand(2);
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();

  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  SDValue Ins = N->getOperand(1);
  EVT IdxVT = Idx.getValueType();

  // For each lane, select the inserted value when the runtime index equals
  // that lane, otherwise keep the original element.
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
    SDValue IC = DAG.getConstant(I, SL, IdxVT);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
    SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
    Ops.push_back(V);
  }

  return DAG.getBuildVector(VecVT, SL, Ops);
}
13569
13570/// Return the source of an fp_extend from f16 to f32, or a converted FP
13571/// constant.
13573 if (Src.getOpcode() == ISD::FP_EXTEND &&
13574 Src.getOperand(0).getValueType() == MVT::f16) {
13575 return Src.getOperand(0);
13576 }
13577
13578 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13579 APFloat Val = CFP->getValueAPF();
13580 bool LosesInfo = true;
13582 if (!LosesInfo)
13583 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13584 }
13585
13586 return SDValue();
13587}
13588
13589SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13590 DAGCombinerInfo &DCI) const {
13591 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13592 "combine only useful on gfx8");
13593
13594 SDValue TruncSrc = N->getOperand(0);
13595 EVT VT = N->getValueType(0);
13596 if (VT != MVT::f16)
13597 return SDValue();
13598
13599 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13600 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13601 return SDValue();
13602
13603 SelectionDAG &DAG = DCI.DAG;
13604 SDLoc SL(N);
13605
13606 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13607 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13608 // casting back.
13609
13610 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13611 // fmin(fmax(a, b), fmax(fmin(a, b), c))
13612 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13613 if (!A)
13614 return SDValue();
13615
13616 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13617 if (!B)
13618 return SDValue();
13619
13620 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13621 if (!C)
13622 return SDValue();
13623
13624 // This changes signaling nan behavior. If an input is a signaling nan, it
13625 // would have been quieted by the fpext originally. We don't care because
13626 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13627 // we would be worse off than just doing the promotion.
13628 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13629 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13630 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13631 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13632}
13633
// Pick the opcode used to fuse a multiply-add pair formed from nodes \p N0
// and \p N1: ISD::FMAD when denormal support is not required, ISD::FMA when
// contraction is permitted, or 0 when no fusion should be done.
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                          const SDNode *N0,
                                          const SDNode *N1) const {
  EVT VT = N0->getValueType(0);

  // Only do this if we are not trying to support denormals. v_mad_f32 does not
  // support denormals ever.
  if (((VT == MVT::f32 &&
       (VT == MVT::f16 && Subtarget->hasMadF16() &&
    return ISD::FMAD;

  // FMA requires global fast/unsafe FP fusion, or contract flags present on
  // both participating nodes.
  const TargetOptions &Options = DAG.getTarget().Options;
  if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
       (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
    return ISD::FMA;
  }

  return 0;
}
13658
13659// For a reassociatable opcode perform:
13660// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13661SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13662 SelectionDAG &DAG) const {
13663 EVT VT = N->getValueType(0);
13664 if (VT != MVT::i32 && VT != MVT::i64)
13665 return SDValue();
13666
13667 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13668 return SDValue();
13669
13670 unsigned Opc = N->getOpcode();
13671 SDValue Op0 = N->getOperand(0);
13672 SDValue Op1 = N->getOperand(1);
13673
13674 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13675 return SDValue();
13676
13677 if (Op0->isDivergent())
13678 std::swap(Op0, Op1);
13679
13680 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13681 return SDValue();
13682
13683 SDValue Op2 = Op1.getOperand(1);
13684 Op1 = Op1.getOperand(0);
13685 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13686 return SDValue();
13687
13688 if (Op1->isDivergent())
13689 std::swap(Op1, Op2);
13690
13691 SDLoc SL(N);
13692 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13693 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13694}
13695
// Build a 64-bit multiply-add node computing N0 * N1 + N2 and truncate the
// i64 result back to \p VT. \p Signed selects the signed flavor of the mad.
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
                           EVT VT,
                           SDValue N0, SDValue N1, SDValue N2,
                           bool Signed) {
  // The mad node has two results {i64, i1}; only the i64 value is used here.
  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
  SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
  return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
}
13705
// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
// multiplies, if any.
//
// Full 64-bit multiplies that feed into an addition are lowered here instead
// of using the generic expansion. The generic expansion ends up with
// a tree of ADD nodes that prevents us from using the "add" part of the
// MAD instruction. The expansion produced here results in a chain of ADDs
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::ADD);

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDLoc SL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (VT.isVector())
    return SDValue();

  // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
  // result in scalar registers for uniform values.
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  // Only scalar types wider than 32 and up to 64 bits are handled.
  unsigned NumBits = VT.getScalarSizeInBits();
  if (NumBits <= 32 || NumBits > 64)
    return SDValue();

  // Canonicalize the multiply into LHS (the caller guarantees one side is a
  // MUL).
  if (LHS.getOpcode() != ISD::MUL) {
    assert(RHS.getOpcode() == ISD::MUL);
    std::swap(LHS, RHS);
  }

  // Avoid the fold if it would unduly increase the number of multiplies due to
  // multiple uses, except on hardware with full-rate multiply-add (which is
  // part of full-rate 64-bit ops).
  if (!Subtarget->hasFullRate64Ops()) {
    unsigned NumUsers = 0;
    for (SDNode *Use : LHS->uses()) {
      // There is a use that does not feed into addition, so the multiply can't
      // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
      if (Use->getOpcode() != ISD::ADD)
        return SDValue();

      // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
      // MUL + 3xADD + 3xADDC over 3xMAD.
      ++NumUsers;
      if (NumUsers >= 3)
        return SDValue();
    }
  }

  SDValue MulLHS = LHS.getOperand(0);
  SDValue MulRHS = LHS.getOperand(1);
  SDValue AddRHS = RHS;

  // Always check whether operands are small unsigned values, since that
  // knowledge is useful in more cases. Check for small signed values only if
  // doing so can unlock a shorter code sequence.
  bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
  bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;

  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
    MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
                  numBitsSigned(MulRHS, DAG) <= 32;
  }

  // The operands and final result all have the same number of bits. If
  // operands need to be extended, they can be extended with garbage. The
  // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
  // truncated away in the end.
  if (VT != MVT::i64) {
    MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
    MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
    AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
  }

  // The basic code generated is conceptually straightforward. Pseudo code:
  //
  // accum = mad_64_32 lhs.lo, rhs.lo, accum
  // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
  // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
  //
  // The second and third lines are optional, depending on whether the factors
  // are {sign,zero}-extended or not.
  //
  // The actual DAG is noisier than the pseudo code, but only due to
  // instructions that disassemble values into low and high parts, and
  // assemble the final result.
  SDValue One = DAG.getConstant(1, SL, MVT::i32);

  auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
  auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
  SDValue Accum =
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);

  // Add the high-part partial products when the factors are not known to fit
  // in 32 bits.
  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    SDValue AccumLo, AccumHi;
    std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);

    if (!MulLHSUnsigned32) {
      auto MulLHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    if (!MulRHSUnsigned32) {
      auto MulRHSHi =
          DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
      SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
      AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
    }

    // Reassemble the 64-bit accumulator from its halves.
    Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
    Accum = DAG.getBitcast(MVT::i64, Accum);
  }

  if (VT != MVT::i64)
    Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
  return Accum;
}
13831
13832// Collect the ultimate src of each of the mul node's operands, and confirm
13833// each operand is 8 bytes.
13834static std::optional<ByteProvider<SDValue>>
13835handleMulOperand(const SDValue &MulOperand) {
13836 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13837 if (!Byte0 || Byte0->isConstantZero()) {
13838 return std::nullopt;
13839 }
13840 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13841 if (Byte1 && !Byte1->isConstantZero()) {
13842 return std::nullopt;
13843 }
13844 return Byte0;
13845}
13846
// Merge two v_perm byte-select masks. A byte lane holding 0x0c selects
// constant zero; for every lane at least one of the inputs must hold 0x0c so
// the non-constant selector (if any) wins, and lanes where both inputs hold
// 0x0c remain 0x0c.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned CMask = 0x0c0c0c0c;
  unsigned FirstCs = First & CMask;
  unsigned SecondCs = Second & CMask;

  // Each byte lane must carry the 0x0c marker in at least one operand.
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));

  return ((First & ~CMask) | (Second & ~CMask)) | (FirstCs & SecondCs);
}
13860
// One source operand for the dot-product combine, carrying the byte-select
// mask used to place its bytes.
struct DotSrc {
  // v_perm-style byte-select mask; a lane holding 0x0c selects constant zero
  // (see addPermMasks above).
  int64_t PermMask;
};
13866
13870 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13871
13872 assert(Src0.Src.has_value() && Src1.Src.has_value());
13873 // Src0s and Src1s are empty, just place arbitrarily.
13874 if (Step == 0) {
13875 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13876 Src0.SrcOffset / 4});
13877 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13878 Src1.SrcOffset / 4});
13879 return;
13880 }
13881
13882 for (int BPI = 0; BPI < 2; BPI++) {
13883 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13884 if (BPI == 1) {
13885 BPP = {Src1, Src0};
13886 }
13887 unsigned ZeroMask = 0x0c0c0c0c;
13888 unsigned FMask = 0xFF << (8 * (3 - Step));
13889
13890 unsigned FirstMask =
13891 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13892 unsigned SecondMask =
13893 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13894 // Attempt to find Src vector which contains our SDValue, if so, add our
13895 // perm mask to the existing one. If we are unable to find a match for the
13896 // first SDValue, attempt to find match for the second.
13897 int FirstGroup = -1;
13898 for (int I = 0; I < 2; I++) {
13899 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13900 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13901 return IterElt.SrcOp == *BPP.first.Src &&
13902 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13903 };
13904
13905 auto Match = llvm::find_if(Srcs, MatchesFirst);
13906 if (Match != Srcs.end()) {
13907 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13908 FirstGroup = I;
13909 break;
13910 }
13911 }
13912 if (FirstGroup != -1) {
13913 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13914 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13915 return IterElt.SrcOp == *BPP.second.Src &&
13916 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13917 };
13918 auto Match = llvm::find_if(Srcs, MatchesSecond);
13919 if (Match != Srcs.end()) {
13920 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13921 } else
13922 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13923 return;
13924 }
13925 }
13926
13927 // If we have made it here, then we could not find a match in Src0s or Src1s
13928 // for either Src0 or Src1, so just place them arbitrarily.
13929
13930 unsigned ZeroMask = 0x0c0c0c0c;
13931 unsigned FMask = 0xFF << (8 * (3 - Step));
13932
13933 Src0s.push_back(
13934 {*Src0.Src,
13935 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13936 Src1.SrcOffset / 4});
13937 Src1s.push_back(
13938 {*Src1.Src,
13939 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13940 Src1.SrcOffset / 4});
13941
13942 return;
13943}
13944
// resolveSources: combine the per-byte DotSrc entries collected for one
// operand of a candidate v_dot4 into a single i32 value. A single entry is
// either returned as-is (identity mask 0x3020100) or permuted with v_perm;
// multiple entries are permuted pairwise and merged with OR.
// NOTE(review): the first line of this definition (the "static SDValue
// resolveSources(SelectionDAG &DAG, SDLoc SL, ..." signature line, original
// line 13945) is missing from this extracted view — restore from upstream.
13946 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13947 bool IsAny) {
13948
13949 // If we just have one source, just permute it accordingly.
13950 if (Srcs.size() == 1) {
13951 auto Elt = Srcs.begin();
13952 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13953
13954 // v_perm will produce the original value
13955 if (Elt->PermMask == 0x3020100)
13956 return EltOp;
13957
13958 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13959 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13960 }
13961
13962 auto FirstElt = Srcs.begin();
13963 auto SecondElt = std::next(FirstElt);
13964
// NOTE(review): the line that was here (original 13965/13966) is missing in
// this view; it must have declared `Perms` (used below, e.g. a
// SmallVector<SDValue, 2>) — restore from upstream.
13966
13967 // If we have multiple sources in the chain, combine them via perms (using
13968 // calculated perm mask) and Ors.
13969 while (true) {
13970 auto FirstMask = FirstElt->PermMask;
13971 auto SecondMask = SecondElt->PermMask;
13972
// Rewrite FirstElt's byte selectors to refer to v_perm src1 (add 4 to each
// selector) while keeping 0x0c "zero" selectors intact.
13973 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13974 unsigned FirstPlusFour = FirstMask | 0x04040404;
13975 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
13976 // original 0x0C.
13977 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13978
13979 auto PermMask = addPermMasks(FirstMask, SecondMask);
13980 auto FirstVal =
13981 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13982 auto SecondVal =
13983 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13984
13985 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13986 SecondVal,
13987 DAG.getConstant(PermMask, SL, MVT::i32)));
13988
13989 FirstElt = std::next(SecondElt);
13990 if (FirstElt == Srcs.end())
13991 break;
13992
13993 SecondElt = std::next(FirstElt);
13994 // If we only have a FirstElt, then just combine that into the cumulative
13995 // source node.
13996 if (SecondElt == Srcs.end()) {
13997 auto EltOp =
13998 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13999
14000 Perms.push_back(
14001 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
14002 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
14003 break;
14004 }
14005 }
14006
// At most two perm results exist (Srcs holds at most 4 entries consumed in
// pairs), so a single OR suffices to merge them.
14007 assert(Perms.size() == 1 || Perms.size() == 2);
14008 return Perms.size() == 2
14009 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
14010 : Perms[0];
14011}
14012
14013static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
14014 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
14015 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
14016 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
14017 EntryMask += ZeroMask;
14018 }
14019}
14020
14021static bool isMul(const SDValue Op) {
14022 auto Opcode = Op.getOpcode();
14023
14024 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
14025 Opcode == AMDGPUISD::MUL_I24);
14026}
14027
// Decide which signedness flavor of dot4 (sdot4 vs udot4) is consistent with
// the two multiply operands of a candidate dot4 step, using known-bits
// information on the extended operands.
// Returns:
//   false        - unsigned semantics (udot4) are correct
//   true         - signed semantics (sdot4) are correct
//   std::nullopt - the operands' sign bits are known to conflict; no match
// NOTE(review): the line carrying the function name and the first parameters
// (original line 14029, between "static std::optional<bool>" and the "Src1"
// parameter below) is missing from this extracted view — restore from
// upstream before building.
14028 static std::optional<bool>
14030 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14031 const SDValue &S1Op, const SelectionDAG &DAG) {
14032 // If we both ops are i8s (pre legalize-dag), then the signedness semantics
14033 // of the dot4 is irrelevant.
14034 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14035 return false;
14036
// countMinLeadingZeros/Ones > 0 tells us the MSB is known-0 (value fits
// unsigned interpretation) or known-1 (known-negative signed value).
14037 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14038 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14039 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14040 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14041 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14042 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14043
// The MSB cannot be both known-0 and known-1.
14044 assert(!(S0IsUnsigned && S0IsSigned));
14045 assert(!(S1IsUnsigned && S1IsSigned));
14046
14047 // There are 9 possible permutations of
14048 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14049
14050 // In two permutations, the sign bits are known to be the same for both Ops,
14051 // so simply return Signed / Unsigned corresponding to the MSB
14052
14053 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14054 return S0IsSigned;
14055
14056 // In another two permutations, the sign bits are known to be opposite. In
14057 // this case return std::nullopt to indicate a bad match.
14058
14059 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14060 return std::nullopt;
14061
14062 // In the remaining five permutations, we don't know the value of the sign
14063 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14064 // the upper bits must be extension bits. Thus, the only ways for the sign
14065 // bit to be unknown is if it was sign extended from unknown value, or if it
14066 // was any extended. In either case, it is correct to use the signed
14067 // version of the signedness semantics of dot4
14068
14069 // In two of such permutations, we known the sign bit is set for
14070 // one op, and the other is unknown. It is okay to used signed version of
14071 // dot4.
14072 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14073 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14074 return true;
14075
14076 // In one such permutation, we don't know either of the sign bits. It is okay
14077 // to used the signed version of dot4.
14078 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14079 return true;
14080
14081 // In two of such permutations, we known the sign bit is unset for
14082 // one op, and the other is unknown. Return std::nullopt to indicate a
14083 // bad match.
14084 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14085 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14086 return std::nullopt;
14087
14088 llvm_unreachable("Fully covered condition");
14089}
14090
// Combine for ISD::ADD. Tries, in order:
//  1. mad64_32 formation when an operand is a MUL (tryFoldToMad64_32);
//  2. scalar-op reassociation (reassociateScalarOps);
//  3. matching a chain of byte-wise multiplies and adds into a v_dot4
//     intrinsic (amdgcn_sdot4 / amdgcn_udot4) on subtargets with dot insts;
//  4. folding add with {zext,sext,anyext}(setcc) or uaddo_carry operands
//     into carry-producing nodes.
14091 SDValue SITargetLowering::performAddCombine(SDNode *N,
14092 DAGCombinerInfo &DCI) const {
14093 SelectionDAG &DAG = DCI.DAG;
14094 EVT VT = N->getValueType(0);
14095 SDLoc SL(N);
14096 SDValue LHS = N->getOperand(0);
14097 SDValue RHS = N->getOperand(1);
14098
14099 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14100 if (Subtarget->hasMad64_32()) {
14101 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14102 return Folded;
14103 }
14104 }
14105
14106 if (SDValue V = reassociateScalarOps(N, DAG)) {
14107 return V;
14108 }
14109
14110 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14111 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14112 SDValue TempNode(N, 0);
14113 std::optional<bool> IsSigned;
// NOTE(review): the lines that were here (original 14114-14117) are missing
// from this extracted view; they must have declared Src0s, Src1s and Src2s
// (SmallVectors used throughout the loop below) — restore from upstream.
14117
14118 // Match the v_dot4 tree, while collecting src nodes.
14119 int ChainLength = 0;
14120 for (int I = 0; I < 4; I++) {
14121 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14122 if (MulIdx == -1)
14123 break;
14124 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14125 if (!Src0)
14126 break;
14127 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14128 if (!Src1)
14129 break;
14130
// Every mul in the chain must agree on signed vs. unsigned dot semantics.
14131 auto IterIsSigned = checkDot4MulSignedness(
14132 TempNode->getOperand(MulIdx), *Src0, *Src1,
14133 TempNode->getOperand(MulIdx)->getOperand(0),
14134 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14135 if (!IterIsSigned)
14136 break;
14137 if (!IsSigned)
14138 IsSigned = *IterIsSigned;
14139 if (*IterIsSigned != *IsSigned)
14140 break;
14141 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14142 auto AddIdx = 1 - MulIdx;
14143 // Allow the special case where add (add (mul24, 0), mul24) became ->
14144 // add (mul24, mul24).
14145 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14146 Src2s.push_back(TempNode->getOperand(AddIdx));
14147 auto Src0 =
14148 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14149 if (!Src0)
14150 break;
14151 auto Src1 =
14152 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14153 if (!Src1)
14154 break;
14155 auto IterIsSigned = checkDot4MulSignedness(
14156 TempNode->getOperand(AddIdx), *Src0, *Src1,
14157 TempNode->getOperand(AddIdx)->getOperand(0),
14158 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14159 if (!IterIsSigned)
14160 break;
14161 assert(IsSigned);
14162 if (*IterIsSigned != *IsSigned)
14163 break;
14164 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
// A synthetic zero accumulator stands in for the missing "+ 0" term.
14165 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14166 ChainLength = I + 2;
14167 break;
14168 }
14169
// Walk down the add chain: the non-mul operand is the next candidate.
14170 TempNode = TempNode->getOperand(AddIdx);
14171 Src2s.push_back(TempNode);
14172 ChainLength = I + 1;
14173 if (TempNode->getNumOperands() < 2)
14174 break;
14175 LHS = TempNode->getOperand(0);
14176 RHS = TempNode->getOperand(1);
14177 }
14178
14179 if (ChainLength < 2)
14180 return SDValue();
14181
14182 // Masks were constructed with assumption that we would find a chain of
14183 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14184 // 0x0c) so they do not affect dot calculation.
14185 if (ChainLength < 4) {
14186 fixMasks(Src0s, ChainLength);
14187 fixMasks(Src1s, ChainLength);
14188 }
14189
14190 SDValue Src0, Src1;
14191
14192 // If we are just using a single source for both, and have permuted the
14193 // bytes consistently, we can just use the sources without permuting
14194 // (commutation).
14195 bool UseOriginalSrc = false;
14196 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14197 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14198 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14199 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
// Each byte of the shared perm mask must select a distinct source byte,
// otherwise dropping the perm would change which products are summed.
14200 SmallVector<unsigned, 4> SrcBytes;
14201 auto Src0Mask = Src0s.begin()->PermMask;
14202 SrcBytes.push_back(Src0Mask & 0xFF000000);
14203 bool UniqueEntries = true;
14204 for (auto I = 1; I < 4; I++) {
14205 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14206
14207 if (is_contained(SrcBytes, NextByte)) {
14208 UniqueEntries = false;
14209 break;
14210 }
14211 SrcBytes.push_back(NextByte);
14212 }
14213
14214 if (UniqueEntries) {
14215 UseOriginalSrc = true;
14216
14217 auto FirstElt = Src0s.begin();
14218 auto FirstEltOp =
14219 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14220
14221 auto SecondElt = Src1s.begin();
14222 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14223 SecondElt->DWordOffset);
14224
14225 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14226 MVT::getIntegerVT(32));
14227 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14228 MVT::getIntegerVT(32));
14229 }
14230 }
14231
14232 if (!UseOriginalSrc) {
14233 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14234 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14235 }
14236
// The accumulator is the last non-mul operand collected from the chain.
14237 assert(IsSigned);
14238 SDValue Src2 =
14239 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14240
14241 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14242 : Intrinsic::amdgcn_udot4,
14243 SL, MVT::i64);
14244
14245 assert(!VT.isVector());
14246 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14247 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14248
14249 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14250 }
14251
14252 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14253 return SDValue();
14254
14255 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14256 // add x, sext (setcc) => usubo_carry x, 0, setcc
// Canonicalize the interesting operand into RHS; add is commutative.
14257 unsigned Opc = LHS.getOpcode();
14258 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14259 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14260 std::swap(RHS, LHS);
14261
14262 Opc = RHS.getOpcode();
14263 switch (Opc) {
14264 default: break;
14265 case ISD::ZERO_EXTEND:
14266 case ISD::SIGN_EXTEND:
14267 case ISD::ANY_EXTEND: {
14268 auto Cond = RHS.getOperand(0);
14269 // If this won't be a real VOPC output, we would still need to insert an
14270 // extra instruction anyway.
14271 if (!isBoolSGPR(Cond))
14272 break;
14273 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14274 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14275 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14276 return DAG.getNode(Opc, SL, VTList, Args);
14277 }
14278 case ISD::UADDO_CARRY: {
14279 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14280 if (!isNullConstant(RHS.getOperand(1)))
14281 break;
14282 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14283 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14284 }
14285 }
14286 return SDValue();
14287}
14288
14289SDValue SITargetLowering::performSubCombine(SDNode *N,
14290 DAGCombinerInfo &DCI) const {
14291 SelectionDAG &DAG = DCI.DAG;
14292 EVT VT = N->getValueType(0);
14293
14294 if (VT != MVT::i32)
14295 return SDValue();
14296
14297 SDLoc SL(N);
14298 SDValue LHS = N->getOperand(0);
14299 SDValue RHS = N->getOperand(1);
14300
14301 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14302 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14303 unsigned Opc = RHS.getOpcode();
14304 switch (Opc) {
14305 default: break;
14306 case ISD::ZERO_EXTEND:
14307 case ISD::SIGN_EXTEND:
14308 case ISD::ANY_EXTEND: {
14309 auto Cond = RHS.getOperand(0);
14310 // If this won't be a real VOPC output, we would still need to insert an
14311 // extra instruction anyway.
14312 if (!isBoolSGPR(Cond))
14313 break;
14314 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14315 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14316 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14317 return DAG.getNode(Opc, SL, VTList, Args);
14318 }
14319 }
14320
14321 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14322 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14323 if (!isNullConstant(LHS.getOperand(1)))
14324 return SDValue();
14325 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14326 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14327 }
14328 return SDValue();
14329}
14330
14331SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14332 DAGCombinerInfo &DCI) const {
14333
14334 if (N->getValueType(0) != MVT::i32)
14335 return SDValue();
14336
14337 if (!isNullConstant(N->getOperand(1)))
14338 return SDValue();
14339
14340 SelectionDAG &DAG = DCI.DAG;
14341 SDValue LHS = N->getOperand(0);
14342
14343 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14344 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14345 unsigned LHSOpc = LHS.getOpcode();
14346 unsigned Opc = N->getOpcode();
14347 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14348 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14349 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14350 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14351 }
14352 return SDValue();
14353}
14354
14355SDValue SITargetLowering::performFAddCombine(SDNode *N,
14356 DAGCombinerInfo &DCI) const {
14357 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14358 return SDValue();
14359
14360 SelectionDAG &DAG = DCI.DAG;
14361 EVT VT = N->getValueType(0);
14362
14363 SDLoc SL(N);
14364 SDValue LHS = N->getOperand(0);
14365 SDValue RHS = N->getOperand(1);
14366
14367 // These should really be instruction patterns, but writing patterns with
14368 // source modifiers is a pain.
14369
14370 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14371 if (LHS.getOpcode() == ISD::FADD) {
14372 SDValue A = LHS.getOperand(0);
14373 if (A == LHS.getOperand(1)) {
14374 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14375 if (FusedOp != 0) {
14376 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14377 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14378 }
14379 }
14380 }
14381
14382 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14383 if (RHS.getOpcode() == ISD::FADD) {
14384 SDValue A = RHS.getOperand(0);
14385 if (A == RHS.getOperand(1)) {
14386 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14387 if (FusedOp != 0) {
14388 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14389 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14390 }
14391 }
14392 }
14393
14394 return SDValue();
14395}
14396
14397SDValue SITargetLowering::performFSubCombine(SDNode *N,
14398 DAGCombinerInfo &DCI) const {
14399 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14400 return SDValue();
14401
14402 SelectionDAG &DAG = DCI.DAG;
14403 SDLoc SL(N);
14404 EVT VT = N->getValueType(0);
14405 assert(!VT.isVector());
14406
14407 // Try to get the fneg to fold into the source modifier. This undoes generic
14408 // DAG combines and folds them into the mad.
14409 //
14410 // Only do this if we are not trying to support denormals. v_mad_f32 does
14411 // not support denormals ever.
14412 SDValue LHS = N->getOperand(0);
14413 SDValue RHS = N->getOperand(1);
14414 if (LHS.getOpcode() == ISD::FADD) {
14415 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14416 SDValue A = LHS.getOperand(0);
14417 if (A == LHS.getOperand(1)) {
14418 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14419 if (FusedOp != 0){
14420 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14421 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14422
14423 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14424 }
14425 }
14426 }
14427
14428 if (RHS.getOpcode() == ISD::FADD) {
14429 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14430
14431 SDValue A = RHS.getOperand(0);
14432 if (A == RHS.getOperand(1)) {
14433 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14434 if (FusedOp != 0){
14435 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14436 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14437 }
14438 }
14439 }
14440
14441 return SDValue();
14442}
14443
14444SDValue SITargetLowering::performFDivCombine(SDNode *N,
14445 DAGCombinerInfo &DCI) const {
14446 SelectionDAG &DAG = DCI.DAG;
14447 SDLoc SL(N);
14448 EVT VT = N->getValueType(0);
14449 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14450 return SDValue();
14451
14452 SDValue LHS = N->getOperand(0);
14453 SDValue RHS = N->getOperand(1);
14454
14455 SDNodeFlags Flags = N->getFlags();
14456 SDNodeFlags RHSFlags = RHS->getFlags();
14457 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14458 !RHS->hasOneUse())
14459 return SDValue();
14460
14461 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14462 bool IsNegative = false;
14463 if (CLHS->isExactlyValue(1.0) ||
14464 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14465 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14466 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14467 if (RHS.getOpcode() == ISD::FSQRT) {
14468 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14469 SDValue Rsq =
14470 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14471 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14472 }
14473 }
14474 }
14475
14476 return SDValue();
14477}
14478
// Combine for ISD::FMA: fold a pair of nested FMAs over fp-extended v2f16
// extracts into an FDOT2 node when the operands come from the same two
// vectors at complementary lanes.
// NOTE(review): two condition-continuation lines (original 14509 and 14527,
// presumably the matching EXTRACT_VECTOR_ELT checks on Op2 / FMAOp2) are
// missing from this extracted view — restore from upstream before building.
14479 SDValue SITargetLowering::performFMACombine(SDNode *N,
14480 DAGCombinerInfo &DCI) const {
14481 SelectionDAG &DAG = DCI.DAG;
14482 EVT VT = N->getValueType(0);
14483 SDLoc SL(N);
14484
14485 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14486 return SDValue();
14487
14488 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14489 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14490 SDValue Op1 = N->getOperand(0);
14491 SDValue Op2 = N->getOperand(1);
14492 SDValue FMA = N->getOperand(2);
14493
14494 if (FMA.getOpcode() != ISD::FMA ||
14495 Op1.getOpcode() != ISD::FP_EXTEND ||
14496 Op2.getOpcode() != ISD::FP_EXTEND)
14497 return SDValue();
14498
14499 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14500 // regardless of the denorm mode setting. Therefore,
14501 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14502 const TargetOptions &Options = DAG.getTarget().Options;
14503 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14504 (N->getFlags().hasAllowContract() &&
14505 FMA->getFlags().hasAllowContract())) {
14506 Op1 = Op1.getOperand(0);
14507 Op2 = Op2.getOperand(0);
14508 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14510 return SDValue();
14511
14512 SDValue Vec1 = Op1.getOperand(0);
14513 SDValue Idx1 = Op1.getOperand(1);
14514 SDValue Vec2 = Op2.getOperand(0);
14515
14516 SDValue FMAOp1 = FMA.getOperand(0);
14517 SDValue FMAOp2 = FMA.getOperand(1);
14518 SDValue FMAAcc = FMA.getOperand(2);
14519
14520 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14521 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14522 return SDValue();
14523
14524 FMAOp1 = FMAOp1.getOperand(0);
14525 FMAOp2 = FMAOp2.getOperand(0);
14526 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14528 return SDValue();
14529
14530 SDValue Vec3 = FMAOp1.getOperand(0);
14531 SDValue Vec4 = FMAOp2.getOperand(0);
14532 SDValue Idx2 = FMAOp1.getOperand(1);
14533
// Outer FMA reads one lane, inner FMA reads the other lane of the same pair.
14534 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14535 // Idx1 and Idx2 cannot be the same.
14536 Idx1 == Idx2)
14537 return SDValue();
14538
14539 if (Vec1 == Vec2 || Vec3 == Vec4)
14540 return SDValue();
14541
14542 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14543 return SDValue();
14544
14545 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14546 (Vec1 == Vec4 && Vec2 == Vec3)) {
14547 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14548 DAG.getTargetConstant(0, SL, MVT::i1));
14549 }
14550 }
14551 return SDValue();
14552}
14553
// Combine for ISD::SETCC:
//  - setcc of (sext from i1) or of a constant-select against the matching
//    constant folds to the boolean (possibly inverted via xor -1);
//  - fcmp oeq/one (fabs x), +inf becomes an FP_CLASS test.
// NOTE(review): several lines are missing from this extracted view: original
// 14569 (inside the swap branch, presumably swapping the condition code),
// 14627 (presumably re-casting RHS to ConstantFPSDNode — required, since
// CRHS->getValueAPF() below needs an FP constant node), and 14634/14636-14640
// (the remaining SIInstrFlags class-mask terms). Restore from upstream.
14554 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14555 DAGCombinerInfo &DCI) const {
14556 SelectionDAG &DAG = DCI.DAG;
14557 SDLoc SL(N);
14558
14559 SDValue LHS = N->getOperand(0);
14560 SDValue RHS = N->getOperand(1);
14561 EVT VT = LHS.getValueType();
14562 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14563
// Canonicalize a constant operand onto the RHS.
14564 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14565 if (!CRHS) {
14566 CRHS = dyn_cast<ConstantSDNode>(LHS);
14567 if (CRHS) {
14568 std::swap(LHS, RHS);
14570 }
14571 }
14572
14573 if (CRHS) {
14574 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14575 isBoolSGPR(LHS.getOperand(0))) {
14576 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14577 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14578 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14579 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14580 if ((CRHS->isAllOnes() &&
14581 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14582 (CRHS->isZero() &&
14583 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14584 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14585 DAG.getConstant(-1, SL, MVT::i1));
14586 if ((CRHS->isAllOnes() &&
14587 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14588 (CRHS->isZero() &&
14589 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14590 return LHS.getOperand(0);
14591 }
14592
14593 const APInt &CRHSVal = CRHS->getAPIntValue();
14594 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14595 LHS.getOpcode() == ISD::SELECT &&
14596 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14597 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14598 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14599 isBoolSGPR(LHS.getOperand(0))) {
14600 // Given CT != FT:
14601 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14602 // setcc (select cc, CT, CF), CF, ne => cc
14603 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14604 // setcc (select cc, CT, CF), CT, eq => cc
14605 const APInt &CT = LHS.getConstantOperandAPInt(1);
14606 const APInt &CF = LHS.getConstantOperandAPInt(2);
14607
14608 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14609 (CT == CRHSVal && CC == ISD::SETNE))
14610 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14611 DAG.getConstant(-1, SL, MVT::i1));
14612 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14613 (CT == CRHSVal && CC == ISD::SETEQ))
14614 return LHS.getOperand(0);
14615 }
14616 }
14617
14618 if (VT != MVT::f32 && VT != MVT::f64 &&
14619 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14620 return SDValue();
14621
14622 // Match isinf/isfinite pattern
14623 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14624 // (fcmp one (fabs x), inf) -> (fp_class x,
14625 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14626 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14628 if (!CRHS)
14629 return SDValue();
14630
14631 const APFloat &APF = CRHS->getValueAPF();
14632 if (APF.isInfinity() && !APF.isNegative()) {
14633 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14635 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14641 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14642 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14643 DAG.getConstant(Mask, SL, MVT::i32));
14644 }
14645 }
14646
14647 return SDValue();
14648}
14649
// Combine for AMDGPUISD::CVT_F32_UBYTE{0..3}: retarget the byte index
// through shifts on the source, and prune the source via demanded bits (only
// the one selected byte matters).
// NOTE(review): original line 14698 — the call whose result initializes
// DemandedSrc below (presumably a SimplifyMultipleUseDemandedBits call) — is
// missing from this extracted view; restore from upstream.
14650 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14651 DAGCombinerInfo &DCI) const {
14652 SelectionDAG &DAG = DCI.DAG;
14653 SDLoc SL(N);
// Opcodes are contiguous, so the difference gives the byte index 0..3.
14654 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14655
// Shift starts as the same operand as Src; it is peeled below while Src
// stays pointing at the original operand for the demanded-bits path.
14656 SDValue Src = N->getOperand(0);
14657 SDValue Shift = N->getOperand(0);
14658
14659 // TODO: Extend type shouldn't matter (assuming legal types).
14660 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14661 Shift = Shift.getOperand(0);
14662
14663 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14664 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14665 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14666 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14667 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14668 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14669 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14670 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14671 SDLoc(Shift.getOperand(0)), MVT::i32);
14672
14673 unsigned ShiftOffset = 8 * Offset;
14674 if (Shift.getOpcode() == ISD::SHL)
14675 ShiftOffset -= C->getZExtValue();
14676 else
14677 ShiftOffset += C->getZExtValue();
14678
// Only byte-aligned shifts within the dword can be absorbed into the opcode.
14679 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14680 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14681 MVT::f32, Shifted);
14682 }
14683 }
14684 }
14685
14686 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14687 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14688 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14689 // We simplified Src. If this node is not dead, visit it again so it is
14690 // folded properly.
14691 if (N->getOpcode() != ISD::DELETED_NODE)
14692 DCI.AddToWorklist(N);
14693 return SDValue(N, 0);
14694 }
14695
14696 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14697 if (SDValue DemandedSrc =
14699 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14700
14701 return SDValue();
14702}
14703
14704SDValue SITargetLowering::performClampCombine(SDNode *N,
14705 DAGCombinerInfo &DCI) const {
14706 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14707 if (!CSrc)
14708 return SDValue();
14709
14710 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14711 const APFloat &F = CSrc->getValueAPF();
14712 APFloat Zero = APFloat::getZero(F.getSemantics());
14713 if (F < Zero ||
14714 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14715 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14716 }
14717
14718 APFloat One(F.getSemantics(), "1.0");
14719 if (F > One)
14720 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14721
14722 return SDValue(CSrc, 0);
14723}
14724
// Top-level DAG-combine dispatcher for the SI target: first tries uniform
// i32 promotion for a set of integer ops, then (above -O0) dispatches to the
// per-opcode perform*Combine helpers.
// NOTE(review): this extracted view is missing several hyperlinked lines:
// the signature line (original 14726, the "SDValue
// SITargetLowering::PerformDAGCombine(SDNode *N," line), a TII declaration
// inside the FSHR case (14790 — TII is used but not declared in the visible
// code), several `case` labels (e.g. 14780-81, 14801, 14803, 14812-13,
// 14826-29, 14833, 14855, 14857), and the final return statement (14876).
// Restore from upstream before building.
14725
14727 DAGCombinerInfo &DCI) const {
14728 switch (N->getOpcode()) {
14729 case ISD::ADD:
14730 case ISD::SUB:
14731 case ISD::SHL:
14732 case ISD::SRL:
14733 case ISD::SRA:
14734 case ISD::AND:
14735 case ISD::OR:
14736 case ISD::XOR:
14737 case ISD::MUL:
14738 case ISD::SETCC:
14739 case ISD::SELECT:
14740 case ISD::SMIN:
14741 case ISD::SMAX:
14742 case ISD::UMIN:
14743 case ISD::UMAX:
14744 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
14745 return Res;
14746 break;
14747 default:
14748 break;
14749 }
14750
// Below this point, combines only run when optimizing.
14751 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14752 return SDValue();
14753
14754 switch (N->getOpcode()) {
14755 case ISD::ADD:
14756 return performAddCombine(N, DCI);
14757 case ISD::SUB:
14758 return performSubCombine(N, DCI);
14759 case ISD::UADDO_CARRY:
14760 case ISD::USUBO_CARRY:
14761 return performAddCarrySubCarryCombine(N, DCI);
14762 case ISD::FADD:
14763 return performFAddCombine(N, DCI);
14764 case ISD::FSUB:
14765 return performFSubCombine(N, DCI);
14766 case ISD::FDIV:
14767 return performFDivCombine(N, DCI);
14768 case ISD::SETCC:
14769 return performSetCCCombine(N, DCI);
14770 case ISD::FMAXNUM:
14771 case ISD::FMINNUM:
14772 case ISD::FMAXNUM_IEEE:
14773 case ISD::FMINNUM_IEEE:
14774 case ISD::FMAXIMUM:
14775 case ISD::FMINIMUM:
14776 case ISD::SMAX:
14777 case ISD::SMIN:
14778 case ISD::UMAX:
14779 case ISD::UMIN:
14782 return performMinMaxCombine(N, DCI);
14783 case ISD::FMA:
14784 return performFMACombine(N, DCI);
14785 case ISD::AND:
14786 return performAndCombine(N, DCI);
14787 case ISD::OR:
14788 return performOrCombine(N, DCI);
14789 case ISD::FSHR: {
// NOTE(review): dropped line 14790 presumably declared TII (the SI
// instruction info pointer used on the next lines).
14791 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14792 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14793 return matchPERM(N, DCI);
14794 }
14795 break;
14796 }
14797 case ISD::XOR:
14798 return performXorCombine(N, DCI);
14799 case ISD::ZERO_EXTEND:
14800 return performZeroExtendCombine(N, DCI);
14802 return performSignExtendInRegCombine(N , DCI);
14804 return performClassCombine(N, DCI);
14805 case ISD::FCANONICALIZE:
14806 return performFCanonicalizeCombine(N, DCI);
14807 case AMDGPUISD::RCP:
14808 return performRcpCombine(N, DCI);
14809 case ISD::FLDEXP:
14810 case AMDGPUISD::FRACT:
14811 case AMDGPUISD::RSQ:
14814 case AMDGPUISD::RSQ_CLAMP: {
14815 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14816 SDValue Src = N->getOperand(0);
14817 if (Src.isUndef())
14818 return Src;
14819 break;
14820 }
14821 case ISD::SINT_TO_FP:
14822 case ISD::UINT_TO_FP:
14823 return performUCharToFloatCombine(N, DCI);
14824 case ISD::FCOPYSIGN:
14825 return performFCopySignCombine(N, DCI);
14830 return performCvtF32UByteNCombine(N, DCI);
14831 case AMDGPUISD::FMED3:
14832 return performFMed3Combine(N, DCI);
14834 return performCvtPkRTZCombine(N, DCI);
14835 case AMDGPUISD::CLAMP:
14836 return performClampCombine(N, DCI);
14837 case ISD::SCALAR_TO_VECTOR: {
14838 SelectionDAG &DAG = DCI.DAG;
14839 EVT VT = N->getValueType(0);
14840
14841 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14842 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14843 SDLoc SL(N);
14844 SDValue Src = N->getOperand(0);
14845 EVT EltVT = Src.getValueType();
14846 if (EltVT != MVT::i16)
14847 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14848
14849 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14850 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14851 }
14852
14853 break;
14854 }
14856 return performExtractVectorEltCombine(N, DCI);
14858 return performInsertVectorEltCombine(N, DCI);
14859 case ISD::FP_ROUND:
14860 return performFPRoundCombine(N, DCI);
14861 case ISD::LOAD: {
14862 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14863 return Widened;
14864 [[fallthrough]];
14865 }
14866 default: {
14867 if (!DCI.isBeforeLegalize()) {
14868 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14869 return performMemSDNodeCombine(MemNode, DCI);
14870 }
14871
14872 break;
14873 }
14874 }
14875
// NOTE(review): dropped line 14876 presumably delegated to the base class
// (AMDGPUTargetLowering::PerformDAGCombine) as the fallback return.
14877}
14878
14879/// Helper function for adjustWritemask
14880static unsigned SubIdx2Lane(unsigned Idx) {
14881 switch (Idx) {
14882 default: return ~0u;
14883 case AMDGPU::sub0: return 0;
14884 case AMDGPU::sub1: return 1;
14885 case AMDGPU::sub2: return 2;
14886 case AMDGPU::sub3: return 3;
14887 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14888 }
14889}
14890
/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions.
/// Shrinks the dmask to cover only the result components that are actually
/// extracted by users, and switches to the equivalent opcode with fewer
/// return channels. Returns the (possibly unchanged) node when nothing was
/// rewritten, or nullptr when the users have already been updated.
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Subtract 1 because the vdata output is not a MachineSDNode operand.
  int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
    return Node; // not implemented for D16

  // One slot per possible result lane (4 data lanes + 1 TFE/LWE lane).
  SDNode *Users[5] = { nullptr };
  unsigned Lane = 0;
  unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
  unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
                     ? true
                     : false;
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;

  if (OldDmask == 0) {
    // These are folded out, but on the chance it happens don't assert.
    return Node;
  }

  unsigned OldBitsSet = llvm::popcount(OldDmask);
  // Work out which is the TFE/LWE lane if that is enabled.
  if (UsesTFC) {
    TFCLane = OldBitsSet;
  }

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Don't look at users of the chain.
    if (I.getUse().getResNo() != 0)
      continue;

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return Node;

    // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));
    if (Lane == ~0u)
      return Node;

    // Check if the use is for the TFE/LWE generated result at VGPRn+1.
    if (UsesTFC && Lane == TFCLane) {
      Users[Lane] = *I;
    } else {
      // Set which texture component corresponds to the lane.
      unsigned Comp;
      for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
        Comp = llvm::countr_zero(Dmask);
        Dmask &= ~(1 << Comp);
      }

      // Abort if we have more than one user per component.
      if (Users[Lane])
        return Node;

      Users[Lane] = *I;
      NewDmask |= 1 << Comp;
    }
  }

  // Don't allow 0 dmask, as hardware assumes one channel enabled.
  bool NoChannels = !NewDmask;
  if (NoChannels) {
    if (!UsesTFC) {
      // No uses of the result and not using TFC. Then do nothing.
      return Node;
    }
    // If the original dmask has one channel - then nothing to do
    if (OldBitsSet == 1)
      return Node;
    // Use an arbitrary dmask - required for the instruction to work
    NewDmask = 1;
  }
  // Abort if there's no change
  if (NewDmask == OldDmask)
    return Node;

  unsigned BitsSet = llvm::popcount(NewDmask);

  // Check for TFE or LWE - increase the number of channels by one to account
  // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWriteMask (this function) but at present D16 are excluded.
  unsigned NewChannels = BitsSet + UsesTFC;

  int NewOpcode =
      AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
  assert(NewOpcode != -1 &&
         NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
         "failed to find equivalent MIMG op");

  // Adjust the writemask in the node
  // NOTE(review): the declaration of Ops (a SmallVector<SDValue>) appears to
  // have been elided from this excerpt — confirm against the full file.
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());

  MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();

  // 3- and 5-channel results are rounded up to the next power-of-two-sized
  // legal vector type.
  MVT ResultVT = NewChannels == 1 ?
    SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
                           NewChannels == 5 ? 8 : NewChannels);
  SDVTList NewVTList = HasChain ?
    DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);


  MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
                                              NewVTList, Ops);

  if (HasChain) {
    // Update chain.
    DAG.setNodeMemRefs(NewNode, Node->memoperands());
    DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
  }

  if (NewChannels == 1) {
    // Scalar result: replace the single EXTRACT_SUBREG user with a COPY of
    // the new node's scalar value.
    assert(Node->hasNUsesOfValue(1, 0));
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
                                      SDLoc(Node), Users[Lane]->getValueType(0),
                                      SDValue(NewNode, 0));
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return nullptr;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
    SDNode *User = Users[i];
    if (!User) {
      // Handle the special case of NoChannels. We set NewDmask to 1 above, but
      // Users[0] is still nullptr because channel 0 doesn't really have a use.
      if (i || !NoChannels)
        continue;
    } else {
      // Retarget the EXTRACT_SUBREG at the new node with the packed subindex.
      SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
      SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
      if (NewUser != User) {
        DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
        DAG.RemoveDeadNode(User);
      }
    }

    // Advance to the next subregister index for the next lane.
    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
    }
  }

  DAG.RemoveDeadNode(Node);
  return nullptr;
}
15060
  // Peek through an AssertZext wrapper before testing for a frame index.
  if (Op.getOpcode() == ISD::AssertZext)
    Op = Op.getOperand(0);

  return isa<FrameIndexSDNode>(Op);
}
15067
/// Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs are to these instructions are registers.
/// Frame-index operands are rewritten to S_MOV_B32 machine nodes below.
                                                        SelectionDAG &DAG) const {
  if (Node->getOpcode() == ISD::CopyToReg) {
    RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
    SDValue SrcVal = Node->getOperand(2);

    // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
    // to try understanding copies to physical registers.
    if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
      SDLoc SL(Node);
      // NOTE(review): the declaration of MRI appears to be elided from this
      // excerpt — confirm against the full file.
      SDValue VReg = DAG.getRegister(
          MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);

      // Thread any glue from the original node through the new copy chain.
      SDNode *Glued = Node->getGluedNode();
      SDValue ToVReg
        = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
                           SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
      SDValue ToResultReg
        = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
                           VReg, ToVReg.getValue(1));
      DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
      DAG.RemoveDeadNode(Node);
      return ToResultReg.getNode();
    }
  }

  // Rebuild the operand list, materializing every frame-index operand into a
  // register with S_MOV_B32.
  // NOTE(review): the declaration of Ops appears to be elided from this
  // excerpt — confirm against the full file.
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isFrameIndexOp(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                             Node->getOperand(i).getValueType(),
                                             Node->getOperand(i)), 0));
  }

  return DAG.UpdateNodeOperands(Node, Ops);
}
15113
/// Fold the instructions after selecting them.
/// Returns null if users were already updated.
                                          SelectionDAG &DAG) const {
  unsigned Opcode = Node->getMachineOpcode();

  // Non-store image ops with a dmask can have unused result channels pruned.
  if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
      !TII->isGather4(Opcode) &&
      AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
    return adjustWritemask(Node, DAG);
  }

  if (Opcode == AMDGPU::INSERT_SUBREG ||
      Opcode == AMDGPU::REG_SEQUENCE) {
    // NOTE(review): a call to legalize frame-index operands appears to be
    // elided from this excerpt — confirm against the full file.
    return Node;
  }

  switch (Opcode) {
  case AMDGPU::V_DIV_SCALE_F32_e64:
  case AMDGPU::V_DIV_SCALE_F64_e64: {
    // Satisfy the operand register constraint when one of the inputs is
    // undefined. Ordinarily each undef value will have its own implicit_def of
    // a vreg, so force these to use a single register.
    SDValue Src0 = Node->getOperand(1);
    SDValue Src1 = Node->getOperand(3);
    SDValue Src2 = Node->getOperand(5);

    // Nothing to do when src0 is defined and already shared with src1/src2.
    if ((Src0.isMachineOpcode() &&
         Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
        (Src0 == Src1 || Src0 == Src2))
      break;

    MVT VT = Src0.getValueType().getSimpleVT();
    const TargetRegisterClass *RC =
        getRegClassFor(VT, Src0.getNode()->isDivergent());

    // NOTE(review): the declaration of MRI appears to be elided from this
    // excerpt — confirm against the full file.
    SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);

    SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
                                      UndefReg, Src0, SDValue());

    // src0 must be the same register as src1 or src2, even if the value is
    // undefined, so make sure we don't violate this constraint.
    if (Src0.isMachineOpcode() &&
        Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
      if (Src1.isMachineOpcode() &&
          Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src1;
      else if (Src2.isMachineOpcode() &&
               Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
        Src0 = Src2;
      else {
        // All inputs undefined: route everything through one undef register.
        assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
        Src0 = UndefReg;
        Src1 = UndefReg;
      }
    } else
      break;

    // Rebuild the node with the unified source operands plus the glue from
    // the copy-to-reg of the undef value.
    SmallVector<SDValue, 9> Ops(Node->ops());
    Ops[1] = Src0;
    Ops[3] = Src1;
    Ops[5] = Src2;
    Ops.push_back(ImpDef.getValue(1));
    return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
  }
  default:
    break;
  }

  return Node;
}
15189
// Any MIMG instructions that use tfe or lwe require an initialization of the
// result register that will be written in the case of a memory access failure.
// The required code is also added to tie this init code to the result of the
// img instruction.
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
  MachineBasicBlock &MBB = *MI.getParent();

  int DstIdx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
  unsigned InitIdx = 0; // Number of dwords of the result that need init.

  if (TII->isImage(MI)) {
    MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
    MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
    MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);

    if (!TFE && !LWE) // intersect_ray
      return;

    unsigned TFEVal = TFE ? TFE->getImm() : 0;
    unsigned LWEVal = LWE ? LWE->getImm() : 0;
    unsigned D16Val = D16 ? D16->getImm() : 0;

    if (!TFEVal && !LWEVal)
      return;

    // At least one of TFE or LWE are non-zero
    // We have to insert a suitable initialization of the result value and
    // tie this to the dest of the image instruction.

    // Calculate which dword we have to initialize to 0.
    MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);

    // check that dmask operand is found.
    assert(MO_Dmask && "Expected dmask operand in instruction");

    unsigned dmask = MO_Dmask->getImm();
    // Determine the number of active lanes taking into account the
    // Gather4 special case
    unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);

    bool Packed = !Subtarget->hasUnpackedD16VMem();

    // Packed D16 halves the data dwords; +1 accounts for the TFE/LWE dword.
    InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;

    // Abandon attempt if the dst size isn't large enough
    // - this is in fact an error but this is picked up elsewhere and
    // reported correctly.
    uint32_t DstSize =
        TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
    if (DstSize < InitIdx)
      return;
  } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
    InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
  } else {
    return;
  }

  const DebugLoc &DL = MI.getDebugLoc();

  // Create a register for the initialization value.
  Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
  unsigned NewDst = 0; // Final initialized value will be in here

  // If PRTStrictNull feature is enabled (the default) then initialize
  // all the result registers to 0, otherwise just the error indication
  // register (VGPRn+1)
  unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
  unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);

  BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
  for (; SizeLeft; SizeLeft--, CurrIdx++) {
    NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
    // Initialize dword
    Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
        .addImm(0);
    // Insert into the super-reg
    // NOTE(review): the subregister-index operand of this INSERT_SUBREG
    // appears to be elided from this excerpt — confirm against the full file.
    BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
        .addReg(PrevDst)
        .addReg(SubReg)

    PrevDst = NewDst;
  }

  // Add as an implicit operand
  MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));

  // Tie the just added implicit operand to the dst
  MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
}
15285
/// Assign the register class depending on the number of
/// bits set in the writemask
                                                     SDNode *Node) const {

  MachineFunction *MF = MI.getParent()->getParent();

  if (TII->isVOP3(MI.getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);

    // Prefer VGPRs over AGPRs in mAI instructions where possible.
    // This saves a chain-copy of registers and better balance register
    // use between vgpr and agpr as agpr tuples tend to be big.
    if (!MI.getDesc().operands().empty()) {
      unsigned Opc = MI.getOpcode();
      bool HasAGPRs = Info->mayNeedAGPRs();
      const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
      int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
      for (auto I :
           {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
        if (I == -1)
          break;
        // src2 is left on AGPRs when the function may need them.
        if ((I == Src2Idx) && (HasAGPRs))
          break;
        MachineOperand &Op = MI.getOperand(I);
        if (!Op.isReg() || !Op.getReg().isVirtual())
          continue;
        auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
        if (!TRI->hasAGPRs(RC))
          continue;
        // Only retype operands whose unique def is a copy from an SGPR.
        auto *Src = MRI.getUniqueVRegDef(Op.getReg());
        if (!Src || !Src->isCopy() ||
            !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
          continue;
        auto *NewRC = TRI->getEquivalentVGPRClass(RC);
        // All uses of agpr64 and agpr32 can also accept vgpr except for
        // v_accvgpr_read, but we do not produce agpr reads during selection,
        // so no use checks are needed.
        MRI.setRegClass(Op.getReg(), NewRC);
      }

      if (!HasAGPRs)
        return;

      // Resolve the rest of AV operands to AGPRs.
      if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
        if (Src2->isReg() && Src2->getReg().isVirtual()) {
          auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
          if (TRI->isVectorSuperClass(RC)) {
            auto *NewRC = TRI->getEquivalentAGPRClass(RC);
            MRI.setRegClass(Src2->getReg(), NewRC);
            // A tied src2 forces the destination into the same class.
            if (Src2->isTied())
              MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
          }
        }
      }
    }

    return;
  }

  // Image address operands must satisfy the subtarget's alignment rules.
  if (TII->isImage(MI))
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
}
15355
                                 uint64_t Val) {
  // Materialize a 32-bit immediate into an SGPR with S_MOV_B32 and return
  // the machine node's value so it can feed REG_SEQUENCE operands.
  SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
15361
                                                const SDLoc &DL,
                                                SDValue Ptr) const {

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
  };

  // The final 128-bit descriptor: pointer in dwords 0-1, constants in 2-3.
  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
}
15392
/// Return a resource descriptor with the 'Add TID' bit enabled
/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
/// of the resource descriptor) to create an offset, which is added to
/// the resource pointer.
                                           SDValue Ptr, uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  // Split the 64-bit pointer into its two dwords.
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    // OR the extra descriptor bits into the high half of the pointer.
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                       DAG.getConstant(RsrcDword1, DL, MVT::i32)),
                    0);
  }

  // Materialize the two constant data dwords of the descriptor.
  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
15426
15427//===----------------------------------------------------------------------===//
15428// SI Inline Assembly Support
15429//===----------------------------------------------------------------------===//
15430
std::pair<unsigned, const TargetRegisterClass *>
                                         StringRef Constraint,
                                         MVT VT) const {
  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);

  const TargetRegisterClass *RC = nullptr;
  if (Constraint.size() == 1) {
    // Single-letter constraints pick a register class by value bit width:
    // 's'/'r' -> SGPRs, 'v' -> VGPRs, 'a' -> AGPRs (MAI subtargets only).
    const unsigned BitWidth = VT.getSizeInBits();
    switch (Constraint[0]) {
    default:
      return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
    case 's':
    case 'r':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::SReg_32RegClass;
        break;
      case 64:
        RC = &AMDGPU::SGPR_64RegClass;
        break;
      default:
        // NOTE(review): the assignment of RC for other widths appears to be
        // elided from this excerpt — confirm against the full file.
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'v':
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::VGPR_32RegClass;
        break;
      default:
        RC = TRI->getVGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    case 'a':
      if (!Subtarget->hasMAIInsts())
        break;
      switch (BitWidth) {
      case 16:
        RC = &AMDGPU::AGPR_32RegClass;
        break;
      default:
        RC = TRI->getAGPRClassForBitWidth(BitWidth);
        if (!RC)
          return std::pair(0U, nullptr);
        break;
      }
      break;
    }
    // We actually support i128, i16 and f16 as inline parameters
    // even if they are not reported as legal
    if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
               VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
      return std::pair(0U, RC);
  }

  // Explicit register constraints of the form {v5}, {s[0:3]}, {a[4:7]}, etc.
  if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
    StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
    if (RegName.consume_front("v")) {
      RC = &AMDGPU::VGPR_32RegClass;
    } else if (RegName.consume_front("s")) {
      RC = &AMDGPU::SGPR_32RegClass;
    } else if (RegName.consume_front("a")) {
      RC = &AMDGPU::AGPR_32RegClass;
    }

    if (RC) {
      uint32_t Idx;
      if (RegName.consume_front("[")) {
        // Register range [Idx:End] -> a tuple class of the right width.
        uint32_t End;
        bool Failed = RegName.consumeInteger(10, Idx);
        Failed |= !RegName.consume_front(":");
        Failed |= RegName.consumeInteger(10, End);
        Failed |= !RegName.consume_back("]");
        if (!Failed) {
          uint32_t Width = (End - Idx + 1) * 32;
          MCRegister Reg = RC->getRegister(Idx);
          // NOTE(review): the leading VGPR-class check of this if/else chain
          // appears to be elided from this excerpt — confirm against the
          // full file.
            RC = TRI->getVGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isSGPRClass(RC))
            RC = TRI->getSGPRClassForBitWidth(Width);
          else if (SIRegisterInfo::isAGPRClass(RC))
            RC = TRI->getAGPRClassForBitWidth(Width);
          if (RC) {
            Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
            return std::pair(Reg, RC);
          }
        }
      } else {
        // Single register, e.g. {v17}.
        bool Failed = RegName.getAsInteger(10, Idx);
        if (!Failed && Idx < RC->getNumRegs())
          return std::pair(RC->getRegister(Idx), RC);
      }
    }
  }

  // Fall back to the generic handling and normalize the returned class to
  // the physical register's base class.
  auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  if (Ret.first)
    Ret.second = TRI->getPhysRegBaseClass(Ret.first);

  return Ret;
}
15539
15540static bool isImmConstraint(StringRef Constraint) {
15541 if (Constraint.size() == 1) {
15542 switch (Constraint[0]) {
15543 default: break;
15544 case 'I':
15545 case 'J':
15546 case 'A':
15547 case 'B':
15548 case 'C':
15549 return true;
15550 }
15551 } else if (Constraint == "DA" ||
15552 Constraint == "DB") {
15553 return true;
15554 }
15555 return false;
15556}
15557
  // 's', 'v' and 'a' name register classes (SGPR/VGPR/AGPR); anything that
  // isImmConstraint recognizes is an immediate-style constraint; everything
  // else is delegated to the generic implementation.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 's':
    case 'v':
    case 'a':
      return C_RegisterClass;
    }
  }
  if (isImmConstraint(Constraint)) {
    return C_Other;
  }
  return TargetLowering::getConstraintType(Constraint);
}
15574
15575static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15577 Val = Val & maskTrailingOnes<uint64_t>(Size);
15578 }
15579 return Val;
15580}
15581
                                                    StringRef Constraint,
                                                    std::vector<SDValue> &Ops,
                                                    SelectionDAG &DAG) const {
  if (isImmConstraint(Constraint)) {
    // Only push an operand when the constant both exists and satisfies the
    // constraint; otherwise Ops is left untouched and lowering fails upstream.
    uint64_t Val;
    if (getAsmOperandConstVal(Op, Val) &&
        checkAsmConstraintVal(Op, Constraint, Val)) {
      Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
      Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
    }
  } else {
    TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
  }
}
15597
  unsigned Size = Op.getScalarValueSizeInBits();
  // Immediates wider than 64 bits can never be inline-asm constants.
  if (Size > 64)
    return false;

  if (Size == 16 && !Subtarget->has16BitInsts())
    return false;

  // NOTE(review): the ConstantSDNode dyn_cast guard for this branch appears
  // to be elided from this excerpt — confirm against the full file.
    Val = C->getSExtValue();
    return true;
  }
  // NOTE(review): the ConstantFPSDNode dyn_cast guard for this branch appears
  // to be elided from this excerpt — confirm against the full file.
    Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }
  // NOTE(review): the BuildVectorSDNode dyn_cast guard for this branch
  // appears to be elided from this excerpt — confirm against the full file.
    if (Size != 16 || Op.getNumOperands() != 2)
      return false;
    if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
      return false;
    // A splat build_vector of a constant (int or FP) yields that constant.
    if (ConstantSDNode *C = V->getConstantSplatNode()) {
      Val = C->getSExtValue();
      return true;
    }
    if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
      Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
      return true;
    }
  }

  return false;
}
15631
                                              uint64_t Val) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'I':
      // NOTE(review): the return for the 'I' case appears to be elided from
      // this excerpt — confirm against the full file.
    case 'J':
      return isInt<16>(Val);
    case 'A':
      return checkAsmConstraintValA(Op, Val);
    case 'B':
      return isInt<32>(Val);
    case 'C':
      return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) || // NOTE(review): RHS of this || appears elided in this excerpt
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    if (Constraint == "DA") {
      // "DA": both 32-bit halves must individually be valid 'A' operands.
      int64_t HiBits = static_cast<int32_t>(Val >> 32);
      int64_t LoBits = static_cast<int32_t>(Val);
      return checkAsmConstraintValA(Op, HiBits, 32) &&
             checkAsmConstraintValA(Op, LoBits, 32);
    }
    if (Constraint == "DB") {
      return true;
    }
  }
  llvm_unreachable("Invalid asm constraint");
}
15663
                                               unsigned MaxSize) const {
  // Check whether Val is usable as an inline literal for the operand's type,
  // considering at most MaxSize bits of the operand.
  unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
  bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
  if (Size == 16) {
    // 16-bit operands: the inline-literal rules differ per element type.
    MVT VT = Op.getSimpleValueType();
    switch (VT.SimpleTy) {
    default:
      return false;
    case MVT::i16:
      return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
    case MVT::f16:
      return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
    case MVT::bf16:
      return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
    case MVT::v2i16:
      return AMDGPU::getInlineEncodingV2I16(Val).has_value();
    case MVT::v2f16:
      return AMDGPU::getInlineEncodingV2F16(Val).has_value();
    case MVT::v2bf16:
      return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
    }
  }
  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
    return true;
  return false;
}
15692
15693static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15694 switch (UnalignedClassID) {
15695 case AMDGPU::VReg_64RegClassID:
15696 return AMDGPU::VReg_64_Align2RegClassID;
15697 case AMDGPU::VReg_96RegClassID:
15698 return AMDGPU::VReg_96_Align2RegClassID;
15699 case AMDGPU::VReg_128RegClassID:
15700 return AMDGPU::VReg_128_Align2RegClassID;
15701 case AMDGPU::VReg_160RegClassID:
15702 return AMDGPU::VReg_160_Align2RegClassID;
15703 case AMDGPU::VReg_192RegClassID:
15704 return AMDGPU::VReg_192_Align2RegClassID;
15705 case AMDGPU::VReg_224RegClassID:
15706 return AMDGPU::VReg_224_Align2RegClassID;
15707 case AMDGPU::VReg_256RegClassID:
15708 return AMDGPU::VReg_256_Align2RegClassID;
15709 case AMDGPU::VReg_288RegClassID:
15710 return AMDGPU::VReg_288_Align2RegClassID;
15711 case AMDGPU::VReg_320RegClassID:
15712 return AMDGPU::VReg_320_Align2RegClassID;
15713 case AMDGPU::VReg_352RegClassID:
15714 return AMDGPU::VReg_352_Align2RegClassID;
15715 case AMDGPU::VReg_384RegClassID:
15716 return AMDGPU::VReg_384_Align2RegClassID;
15717 case AMDGPU::VReg_512RegClassID:
15718 return AMDGPU::VReg_512_Align2RegClassID;
15719 case AMDGPU::VReg_1024RegClassID:
15720 return AMDGPU::VReg_1024_Align2RegClassID;
15721 case AMDGPU::AReg_64RegClassID:
15722 return AMDGPU::AReg_64_Align2RegClassID;
15723 case AMDGPU::AReg_96RegClassID:
15724 return AMDGPU::AReg_96_Align2RegClassID;
15725 case AMDGPU::AReg_128RegClassID:
15726 return AMDGPU::AReg_128_Align2RegClassID;
15727 case AMDGPU::AReg_160RegClassID:
15728 return AMDGPU::AReg_160_Align2RegClassID;
15729 case AMDGPU::AReg_192RegClassID:
15730 return AMDGPU::AReg_192_Align2RegClassID;
15731 case AMDGPU::AReg_256RegClassID:
15732 return AMDGPU::AReg_256_Align2RegClassID;
15733 case AMDGPU::AReg_512RegClassID:
15734 return AMDGPU::AReg_512_Align2RegClassID;
15735 case AMDGPU::AReg_1024RegClassID:
15736 return AMDGPU::AReg_1024_Align2RegClassID;
15737 default:
15738 return -1;
15739 }
15740}
15741
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  if (Info->isEntryFunction()) {
    // Callable functions have fixed registers used for stack access.
    // NOTE(review): the body of this branch appears to be elided from this
    // excerpt — confirm against the full file.
  }

  // TODO: Move this logic to getReservedRegs()
  // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  Register SReg = ST.isWave32()
                      ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
                      : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
                                                     &AMDGPU::SGPR_64RegClass);
  Info->setSGPRForEXECCopy(SReg);

  assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
                             Info->getStackPtrOffsetReg()));
  // Replace the placeholder frame registers with the ones chosen by the MFI.
  if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
    MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());

  // We need to worry about replacing the default register with itself in case
  // of MIR testcases missing the MFI.
  if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
    MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());

  if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
    MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());

  Info->limitOccupancy(MF);

  if (ST.isWave32() && !MF.empty()) {
    for (auto &MBB : MF) {
      for (auto &MI : MBB) {
        TII->fixImplicitOperands(MI);
      }
    }
  }

  // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
  // classes if required. Ideally the register class constraints would differ
  // per-subtarget, but there's no easy way to achieve that right now. This is
  // not a problem for VGPRs because the correctly aligned VGPR class is implied
  // from using them as the register class for legal types.
  if (ST.needsAlignedVGPRs()) {
    for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
      const Register Reg = Register::index2VirtReg(I);
      const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
      if (!RC)
        continue;
      int NewClassID = getAlignedAGPRClassID(RC->getID());
      if (NewClassID != -1)
        MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
    }
  }

}
15808
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  Known.resetAll();
  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  // NOTE(review): the case label for this intrinsic block appears to be
  // elided from this excerpt — confirm against the full file.
    unsigned IID = Op.getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      const GCNSubtarget &ST =
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(
          IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
      // Combine with the known bits of the src1 addend.
      KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
      Known = KnownBits::add(Known, Known2);
      return;
    }
    }
    break;
  }
  }
  // NOTE(review): the call into the AMDGPU base-class implementation appears
  // to be partially elided from this excerpt — confirm against the full file.
      Op, Known, DemandedElts, DAG, Depth);
}
15839
    const int FI, KnownBits &Known, const MachineFunction &MF) const {

  // Set the high bits to zero based on the maximum allowed scratch size per
  // wave. We can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
15849
// Mark the high bits of a workitem-id value as known zero, based on the
// subtarget's maximum workitem id for dimension \p Dim.
// NOTE(review): the first line of this static helper's signature is missing
// from this capture.
                                   KnownBits &Known, unsigned Dim) {
  unsigned MaxValue =
      ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
  // All bits above the position of the max value are provably zero.
  Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}
15856
// GlobalISel counterpart of computeKnownBitsForTargetNode: refine KnownBits
// for AMDGPU-specific generic instructions and intrinsics.
// NOTE(review): the first line of the signature is missing from this capture.
    GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
    const MachineRegisterInfo &MRI, unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  switch (MI->getOpcode()) {
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT: {
    Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
    switch (IID) {
    case Intrinsic::amdgcn_workitem_id_x:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
      break;
    case Intrinsic::amdgcn_workitem_id_y:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
      break;
    case Intrinsic::amdgcn_workitem_id_z:
      knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
      break;
    case Intrinsic::amdgcn_mbcnt_lo:
    case Intrinsic::amdgcn_mbcnt_hi: {
      // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
      // most 31 + src1.
      Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
                                 ? getSubtarget()->getWavefrontSizeLog2()
                                 : 5);
      // Operand 3 is src1; the result is the lane count plus src1.
      KnownBits Known2;
      KB.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
                              Depth + 1);
      Known = KnownBits::add(Known, Known2);
      break;
    }
    case Intrinsic::amdgcn_groupstaticsize: {
      // We can report everything over the maximum size as 0. We can't report
      // based on the actual size because we don't know if it's accurate or not
      // at any given point.
      Known.Zero.setHighBits(
          llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
      break;
    }
    }
    break;
  }
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
    // Zero-extending byte load: top 24 bits are zero.
    Known.Zero.setHighBits(24);
    break;
  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
    // Zero-extending short load: top 16 bits are zero.
    Known.Zero.setHighBits(16);
    break;
  case AMDGPU::G_AMDGPU_SMED3:
  case AMDGPU::G_AMDGPU_UMED3: {
    auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();

    KnownBits Known2;
    KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
    if (Known2.isUnknown())
      break;

    KnownBits Known1;
    KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
    if (Known1.isUnknown())
      break;

    KnownBits Known0;
    KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
    if (Known0.isUnknown())
      break;

    // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
    // A med3 result bit can only be known if it agrees in all three operands.
    Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
    Known.One = Known0.One & Known1.One & Known2.One;
    break;
  }
  }
}
15931
// Return the known alignment of the value in \p R: for AMDGPU intrinsics,
// read the intrinsic's declared return-alignment attribute; otherwise assume
// no alignment knowledge (Align(1)).
// NOTE(review): the first line(s) of the signature, and the line defining
// 'Ctx', are missing from this capture.
                                                        unsigned Depth) const {
  const MachineInstr *MI = MRI.getVRegDef(R);
  if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
    // FIXME: Can this move to generic code? What about the case where the call
    // site specifies a lower alignment?
    Intrinsic::ID IID = GI->getIntrinsicID();
    AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
    if (MaybeAlign RetAlign = Attrs.getRetAlignment())
      return *RetAlign;
  }
  return Align(1);
}
15947
  // Preferred loop alignment for GFX10+ with an instruction prefetcher: align
  // loop headers to an I$ line when the loop fits the prefetch window, and
  // emit S_INST_PREFETCH adjustments around larger loops.
  // NOTE(review): the function signature, the definition of 'PrefAlign', and
  // the definition of 'TII' are missing from this capture -- restore from
  // upstream before compiling.
  const Align CacheLineAlign = Align(64);

  // Pre-GFX10 target did not benefit from loop alignment
  if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
      getSubtarget()->hasInstFwdPrefetchBug())
    return PrefAlign;

  // On GFX10 I$ is 4 x 64 bytes cache lines.
  // By default prefetcher keeps one cache line behind and reads two ahead.
  // We can modify it with S_INST_PREFETCH for larger loops to have two lines
  // behind and one ahead.
  // Therefor we can benefit from aligning loop headers if loop fits 192 bytes.
  // If loop fits 64 bytes it always spans no more than two cache lines and
  // does not need an alignment.
  // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
  // Else if loop is less or equal 192 bytes we need two lines behind.

  const MachineBasicBlock *Header = ML->getHeader();
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment(); // Already processed.

  unsigned LoopSize = 0;
  for (const MachineBasicBlock *MBB : ML->blocks()) {
    // If inner loop block is aligned assume in average half of the alignment
    // size to be added as nops.
    if (MBB != Header)
      LoopSize += MBB->getAlignment().value() / 2;

    for (const MachineInstr &MI : *MBB) {
      LoopSize += TII->getInstSizeInBytes(MI);
      // Over 192 bytes the prefetch window can't cover the loop anyway.
      if (LoopSize > 192)
        return PrefAlign;
    }
  }

  if (LoopSize <= 64)
    return PrefAlign;

  if (LoopSize <= 128)
    return CacheLineAlign;

  // If any of parent loops is surrounded by prefetch instructions do not
  // insert new for inner loop, which would reset parent's settings.
  for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
    if (MachineBasicBlock *Exit = P->getExitBlock()) {
      auto I = Exit->getFirstNonDebugInstr();
      if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
        return CacheLineAlign;
    }
  }

  MachineBasicBlock *Pre = ML->getLoopPreheader();
  MachineBasicBlock *Exit = ML->getExitBlock();

  if (Pre && Exit) {
    auto PreTerm = Pre->getFirstTerminator();
    if (PreTerm == Pre->begin() ||
        std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(1); // prefetch 2 lines behind PC

    auto ExitHead = Exit->getFirstNonDebugInstr();
    if (ExitHead == Exit->end() ||
        ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
      BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
          .addImm(2); // prefetch 1 line behind PC
  }

  return CacheLineAlign;
}
16021
16023static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
16024 assert(N->getOpcode() == ISD::CopyFromReg);
16025 do {
16026 // Follow the chain until we find an INLINEASM node.
16027 N = N->getOperand(0).getNode();
16028 if (N->getOpcode() == ISD::INLINEASM ||
16029 N->getOpcode() == ISD::INLINEASM_BR)
16030 return true;
16031 } while (N->getOpcode() == ISD::CopyFromReg);
16032 return false;
16033}
16034
// Target hook: report whether the SelectionDAG node \p N produces a divergent
// (per-lane varying) value.
// NOTE(review): this capture is missing the first line(s) of the signature,
// the return statement of the LOAD case, the INTRINSIC_WO_CHAIN /
// INTRINSIC_W_CHAIN case labels, and a long run of target atomic case labels
// before the "read-modify-write atomics" return -- restore from upstream.
                                                 UniformityInfo *UA) const {
  switch (N->getOpcode()) {
  case ISD::CopyFromReg: {
    const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
    const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
    const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
    Register Reg = R->getReg();

    // FIXME: Why does this need to consider isLiveIn?
    if (Reg.isPhysical() || MRI.isLiveIn(Reg))
      return !TRI->isSGPRReg(MRI, Reg);

    // A virtual register tied to an IR value inherits that value's
    // uniformity.
    if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
      return UA->isDivergent(V);

    assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
    return !TRI->isSGPRReg(MRI, Reg);
  }
  case ISD::LOAD: {
    const LoadSDNode *L = cast<LoadSDNode>(N);
    unsigned AS = L->getAddressSpace();
    // A flat load may access private memory.
    // NOTE(review): the return for this case is missing from this capture.
  }
  case ISD::CALLSEQ_END:
    return true;
    // NOTE(review): intrinsic case labels missing above each return below.
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
    return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
    // Target-specific read-modify-write atomics are sources of divergence.
    return true;
  default:
    if (auto *A = dyn_cast<AtomicSDNode>(N)) {
      // Generic read-modify-write atomics are sources of divergence.
      return A->readMem() && A->writeMem();
    }
    return false;
  }
}
16095
// Target hook (SelectionDAG): is a fused multiply-add faster than separate
// fmul + fadd for the scalar type of \p VT?
// NOTE(review): the signature's first line and the return expressions for the
// f32 and f16 cases are missing from this capture.
                                                  EVT VT) const {
  switch (VT.getScalarType().getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
  case MVT::f16:
  default:
    return false;
  }
}
16108
// Target hook (GlobalISel): is fused multiply-add faster than separate
// fmul + fadd for \p Ty? FMA is profitable exactly when denormals are not
// flushed for the corresponding precision (flushing fmul+fadd would differ).
// NOTE(review): the first line of the signature is missing from this capture.
    LLT Ty, const MachineFunction &MF) const {
  switch (Ty.getScalarSizeInBits()) {
  case 32:
    return !denormalModeIsFlushAllF32(MF);
  case 64:
  case 16:
    return !denormalModeIsFlushAllF64F16(MF);
  default:
    return false;
  }
}
16121
// Target hook: is \p Op known to never be a NaN? Handles AMDGPUISD::CLAMP
// specially; everything else is delegated to the AMDGPU base implementation.
// NOTE(review): the first line(s) of the signature, the line defining 'Info',
// and the callee name of the final delegation call are missing from this
// capture.
                                                   const SelectionDAG &DAG,
                                                   bool SNaN,
                                                   unsigned Depth) const {
  if (Op.getOpcode() == AMDGPUISD::CLAMP) {
    const MachineFunction &MF = DAG.getMachineFunction();

    // With DX10 clamp mode, a NaN input clamps to 0, so the result can never
    // be NaN regardless of the operand.
    if (Info->getMode().DX10Clamp)
      return true; // Clamped to 0.
    return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
  }

      SNaN, Depth);
}
16138
// On older subtargets, global FP atomic instructions have a hardcoded FP mode
// and do not support FP32 denormals, and only support v2f16/f64 denormals.
// Returns true when it is acceptable to use such an instruction anyway:
// either the atomic opts out via metadata, the function's denormal mode is
// already flush-to-zero (preserve-sign), or the legacy unsafe-fp-atomics
// attribute is set.
// NOTE(review): the helper's signature line and the line defining 'Flt' are
// missing from this capture.
  if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
    return true;

  auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
  if (DenormMode == DenormalMode::getPreserveSign())
    return true;

  // TODO: Remove this.
  return RMW->getFunction()
      ->getFnAttribute("amdgpu-unsafe-fp-atomics")
      .getValueAsBool();
}
16155
// Build an optimization remark announcing that a hardware instruction was
// generated for \p RMW, including its operation name and memory scope (the
// empty/system sync scope is reported as "system").
// NOTE(review): the helper's signature line is missing from this capture.
  LLVMContext &Ctx = RMW->getContext();
  StringRef SS = Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("");
  StringRef MemScope = SS.empty() ? StringRef("system") : SS;

  return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
         << "Hardware instruction generated for atomic "
         << RMW->getOperationName(RMW->getOperation())
         << " operation at memory scope " << MemScope;
}
16166
16167static bool isV2F16OrV2BF16(Type *Ty) {
16168 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
16169 Type *EltTy = VT->getElementType();
16170 return VT->getNumElements() == 2 &&
16171 (EltTy->isHalfTy() || EltTy->isBFloatTy());
16172 }
16173
16174 return false;
16175}
16176
// \return true if \p Ty is a <2 x half> vector.
static bool isV2F16(Type *Ty) {
  // NOTE(review): the line defining 'VT' (likely a dyn_cast to
  // FixedVectorType) is missing from this capture.
  return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
}
16181
// \return true if \p Ty is a <2 x bfloat> vector.
static bool isV2BF16(Type *Ty) {
  // NOTE(review): the line defining 'VT' (likely a dyn_cast to
  // FixedVectorType) is missing from this capture.
  return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
}
16186
16187/// \return true if atomicrmw integer ops work for the type.
16188static bool isAtomicRMWLegalIntTy(Type *Ty) {
16189 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
16190 unsigned BW = IT->getBitWidth();
16191 return BW == 32 || BW == 64;
16192 }
16193
16194 return false;
16195}
16196
/// \return true if this atomicrmw xchg type can be selected.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
  Type *Ty = RMW->getType();
  if (isAtomicRMWLegalIntTy(Ty))
    return true;

  // Pointer xchg is fine when the pointer's bit width matches a legal
  // integer atomic width.
  if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
    const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
    unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
    return BW == 32 || BW == 64;
  }

  if (Ty->isFloatTy() || Ty->isDoubleTy())
    return true;

  // NOTE(review): the guard introducing 'VT' (likely a dyn_cast to
  // FixedVectorType) is missing from this capture.
    return VT->getNumElements() == 2 &&
           VT->getElementType()->getPrimitiveSizeInBits() == 16;
  }

  return false;
}
16219
/// \returns true if it's valid to emit a native instruction for \p RMW, based
/// on the properties of the target memory.
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
                                        const AtomicRMWInst *RMW,
                                        bool HasSystemScope) {
  // The remote/fine-grained access logic is different from the integer
  // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
  // fine-grained access does not work, even for a device local allocation.
  //
  // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
  // allocations work.
  if (HasSystemScope) {
    // NOTE(review): the subtarget-feature condition line(s) preceding this
    // metadata check, and the condition before the second 'return true', are
    // missing from this capture -- restore from upstream.
        RMW->hasMetadata("amdgpu.no.remote.memory"))
      return true;
    return true;

  // Without system scope, legality only requires the no-fine-grained-memory
  // promise on the atomic.
  return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
}
16240
/// \return Action to perform on AtomicRMWInsts for integer operations.
// Target hook deciding, per atomicrmw operation / type / address space /
// sync scope, whether to emit a native instruction (None), expand to a
// cmpxchg loop, or otherwise rewrite the atomic.
// NOTE(review): this capture is heavily truncated -- the function signature,
// several return statements (e.g. NotAtomic for private, the Xchg ternary's
// branches, CmpXChg fallbacks), some case labels (UIncWrap/UDecWrap, FSub,
// FMin, UMin), and several condition continuations are missing. Restore from
// upstream LLVM before compiling; comments below annotate visible logic only.

  unsigned AS = RMW->getPointerAddressSpace();
  if (AS == AMDGPUAS::PRIVATE_ADDRESS)

  // Wrap a "None" (native instruction) decision with an optimization remark,
  // since some of these selections rely on unsafe-request metadata.
  auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
    ORE.emit([=]() {
      return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
    });
    return Kind;
  };

  auto SSID = RMW->getSyncScopeID();
  bool HasSystemScope =
      SSID == SyncScope::System ||
      SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");

  auto Op = RMW->getOperation();
  switch (Op) {
  case AtomicRMWInst::Xchg: {
    // PCIe supports add and xchg for system atomics.
    return isAtomicRMWLegalXChgTy(RMW)
  }
  case AtomicRMWInst::Add:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Sub:
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor: {
    // Atomic sub/or/xor do not work over PCI express, but atomic add
    // does. InstCombine transforms these with 0 to or, so undo that.
    if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
      if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
          ConstVal && ConstVal->isNullValue())
    }

  }
  case AtomicRMWInst::FAdd: {
    Type *Ty = RMW->getType();

    // TODO: Handle REGION_ADDRESS
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      // DS F32 FP atomics do respect the denormal mode, but the rounding mode
      // is fixed to round-to-nearest-even.
      //
      // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
      // round-to-nearest-even.
      //
      // We ignore the rounding mode problem, even in strictfp. The C++ standard
      // suggests it is OK if the floating-point mode may not match the calling
      // thread.
      if (Ty->isFloatTy()) {
      }

      if (Ty->isDoubleTy()) {
        // Ignores denormal mode, but we don't consider flushing mandatory.
      }

      if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))

    }

    // LDS atomics respect the denormal mode from the mode register.
    //
    // Traditionally f32 global/buffer memory atomics would unconditionally
    // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
    // flush.
    //
    // On targets with flat atomic fadd, denormals would flush depending on
    // whether the target address resides in LDS or global memory. We consider
    // this flat-maybe-flush as will-flush.
    if (Ty->isFloatTy() &&

    // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
    // safe. The message phrasing also should be better.
    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
      if (AS == AMDGPUAS::FLAT_ADDRESS) {
        // gfx940, gfx12
        if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
        // gfx90a, gfx940, gfx12
        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // gfx940, gfx12
        if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
        // gfx90a, gfx940, gfx12
        if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for
        // buffer. gfx12 does have the buffer version.
        if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      }

      // global and flat atomic fadd f64: gfx90a, gfx940.
      if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
        return ReportUnsafeHWInst(AtomicExpansionKind::None);

      if (AS != AMDGPUAS::FLAT_ADDRESS) {
        if (Ty->isFloatTy()) {
          // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940,
          // gfx11+.
          if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
          // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
          if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
        } else {
          // gfx908
          if (RMW->use_empty() &&
              isV2F16(Ty))
            return ReportUnsafeHWInst(AtomicExpansionKind::None);
        }
      }

      // flat atomic fadd f32: gfx940, gfx11+.
      if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
        if (Subtarget->hasFlatAtomicFaddF32Inst())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);

        // If it is in flat address space, and the type is float, we will try to
        // expand it, if the target supports global and lds atomic fadd. The
        // reason we need that is, in the expansion, we emit the check of
        // address space. If it is in global address space, we emit the global
        // atomic fadd; if it is in shared address space, we emit the LDS atomic
        // fadd.
        if (Subtarget->hasLDSFPAtomicAddF32()) {
          if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
          if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
        }
      }
    }

  }
  case AtomicRMWInst::FMax: {
    Type *Ty = RMW->getType();

    // LDS float and double fmin/fmax were always supported.
    if (AS == AMDGPUAS::LOCAL_ADDRESS) {
      return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
    }

    if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
      // For flat and global cases:
      // float, double in gfx7. Manual claims denormal support.
      // Removed in gfx8.
      // float, double restored in gfx10.
      // double removed again in gfx11, so only f32 for gfx11/gfx12.
      //
      // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but
      // no f32.
      if (AS == AMDGPUAS::FLAT_ADDRESS) {
        if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
        if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
        if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
          return ReportUnsafeHWInst(AtomicExpansionKind::None);
      }
    }

  }
  case AtomicRMWInst::Min:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::UMax: {
    // Always expand system scope min/max atomics.
    if (HasSystemScope)
  }

  default:
  }

  llvm_unreachable("covered atomicrmw op switch");
}
16465
16472
16479
16486
// Target hook: pick the register class for \p VT depending on whether the
// value is divergent. Uniform values get an SGPR class, divergent values a
// VGPR class; the special VReg_1 (wave mask) class maps to the wave-sized
// scalar register class when uniform.
const TargetRegisterClass *
SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
  // NOTE(review): the line defining 'RC' (the base-class lookup) is missing
  // from this capture.
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
    return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
                                               : &AMDGPU::SReg_32RegClass;
  if (!TRI->isSGPRClass(RC) && !isDivergent)
    return TRI->getEquivalentSGPRClass(RC);
  if (TRI->isSGPRClass(RC) && isDivergent)
    return TRI->getEquivalentVGPRClass(RC);

  return RC;
}
16501
// FIXME: This is a workaround for DivergenceAnalysis not understanding always
// uniform values (as produced by the mask results of control flow intrinsics)
// used outside of divergent blocks. The phi users need to also be treated as
// always uniform.
//
// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
// Recursively determine whether \p V (or any transitive non-intrinsic user)
// feeds the mask operand of an AMDGPU control-flow intrinsic. \p WaveSize is
// the wavefront size; only integers of exactly that width can be lane masks.
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
                      unsigned WaveSize) {
  // FIXME: We assume we never cast the mask results of a control flow
  // intrinsic.
  // Early exit if the type won't be consistent as a compile time hack.
  IntegerType *IT = dyn_cast<IntegerType>(V->getType());
  if (!IT || IT->getBitWidth() != WaveSize)
    return false;

  if (!isa<Instruction>(V))
    return false;
  // The Visited set breaks cycles (e.g. through phis) and caps rework.
  if (!Visited.insert(V).second)
    return false;
  bool Result = false;
  for (const auto *U : V->users()) {
    if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
      // Mask used as operand 1 of if/else/if_break.
      if (V == U->getOperand(1)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_if_break:
        case Intrinsic::amdgcn_if:
        case Intrinsic::amdgcn_else:
          Result = true;
          break;
        }
      }
      // Mask used as operand 0 of end_cf/loop.
      // NOTE(review): this check runs even when the previous one already set
      // Result, and its default case can overwrite a prior 'true' --
      // preserved as-is; confirm against upstream before restructuring.
      if (V == U->getOperand(0)) {
        switch (Intrinsic->getIntrinsicID()) {
        default:
          Result = false;
          break;
        case Intrinsic::amdgcn_end_cf:
        case Intrinsic::amdgcn_loop:
          Result = true;
          break;
        }
      }
    } else {
      // Not a CF intrinsic: the mask may still reach one through this user.
      Result = hasCFUser(U, Visited, WaveSize);
    }
    if (Result)
      break;
  }
  return Result;
}
16555
// Target hook: must the value \p V live in a uniform (SGPR) register?
// True for inline-asm calls with any SGPR output constraint, and for values
// that feed AMDGPU control-flow intrinsic masks (see hasCFUser).
// NOTE(review): the signature's first line, the start of the
// ParseConstraints call, the start of the getRegForInlineAsmConstraint call,
// and the declaration of 'Visited' are missing from this capture.
                                               const Value *V) const {
  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm()) {
      // FIXME: This cannot give a correct answer. This should only trigger in
      // the case where inline asm returns mixed SGPR and VGPR results, used
      // outside the defining block. We don't have a specific result to
      // consider, so this assumes if any value is SGPR, the overall register
      // also needs to be SGPR.
      const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
          MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
      for (auto &TC : TargetConstraints) {
        if (TC.Type == InlineAsm::isOutput) {
              SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
          if (RC && SIRI->isSGPRClass(RC))
            return true;
        }
      }
    }
  }
  return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
}
16582
// Return true if some user of the node is a MemSDNode that consumes it as
// its base pointer operand.
// NOTE(review): the function signature and the initialization of the
// use-iterators 'I'/'E' are missing from this capture.
  for (; I != E; ++I) {
    if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
      // Only count uses in the base-pointer operand position.
      if (getBasePtrIndex(M) == I.getOperandNo())
        return true;
    }
  }
  return false;
}
16593
// Target hook (SelectionDAG): is it profitable to reassociate an expression
// rooted at \p N0 with \p N1? Avoid reassociating when it would make a
// uniform value divergent, or when N0 feeds a base+offset memory address.
// NOTE(review): the first line(s) of the signature are missing from this
// capture.
                                           SDValue N1) const {
  if (!N0.hasOneUse())
    return false;
  // Take care of the opportunity to keep N0 uniform
  if (N0->isDivergent() || !N1->isDivergent())
    return true;
  // Check if we have a good chance to form the memory access pattern with the
  // base and offset
  return (DAG.isBaseWithConstantOffset(N0) &&
          hasMemSDNodeUser(*N0->use_begin()));
}
16606
// GlobalISel overload: reassociation is profitable only while N0 has a single
// non-debug use.
// NOTE(review): the first line of the signature is missing from this capture.
                                           Register N0, Register N1) const {
  return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
}
16611
  // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
  // NOTE(review): the function signature and the initialization of 'Flags'
  // are missing from this capture.
  if (I.getMetadata("amdgpu.noclobber"))
    Flags |= MONoClobber;
  if (I.getMetadata("amdgpu.last.use"))
    Flags |= MOLastUse;
  return Flags;
}
16622
// Target hook: detect a physical-register dependency between \p Def and
// \p User. Recognizes an i1 compare result that is implicitly defined in SCC
// and copied out via CopyToReg, reporting SCC and its copy cost.
// NOTE(review): the first line of the signature and the line defining 'MDef'
// (likely a dyn_cast of Def to MachineSDNode) are missing from this capture.
    SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
    const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
  if (User->getOpcode() != ISD::CopyToReg)
    return false;
  if (!Def->isMachineOpcode())
    return false;
  if (!MDef)
    return false;

  unsigned ResNo = User->getOperand(Op).getResNo();
  // Only the i1 (condition) result participates in the SCC dependency.
  if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
    return false;
  const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
  if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
    PhysReg = AMDGPU::SCC;
    const TargetRegisterClass *RC =
        TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
    Cost = RC->getCopyCost();
    return true;
  }
  return false;
}
16647
// Expand a flat-address-space FP32 atomicrmw fadd into an explicit
// address-space dispatch: LDS atomic for shared pointers, a plain
// load/fadd/store for private pointers, and a global atomic otherwise, with
// the results merged through a phi (see the diagram below).
// NOTE(review): the function signature, the guard for the integer sub/or/xor
// rewrite at the top, the setOperation(Add) call, part of the flat-address
// assertion, and the second arguments of the three CreateAddrSpaceCast calls
// (plus the shared-clone operand index) are missing from this capture --
// restore from upstream before compiling.

  // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
  assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
         "this cannot be replaced with add");
    return;
  }

  assert(Subtarget->hasAtomicFaddInsts() &&
         "target should have atomic fadd instructions");
  assert(AI->getType()->isFloatTy() &&
         "generic atomicrmw expansion only supports FP32 operand in flat "
         "address space");
  assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");

  // Given: atomicrmw fadd ptr %addr, float %val ordering
  //
  // With this expansion we produce the following code:
  //   [...]
  //   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
  //   br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
  //
  // atomicrmw.shared:
  //   %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
  //   %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.check.private:
  //   %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
  //   br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
  //
  // atomicrmw.private:
  //   %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
  //   %loaded.private = load float, ptr addrspace(5) %cast.private
  //   %val.new = fadd float %loaded.private, %val
  //   store float %val.new, ptr addrspace(5) %cast.private
  //   br label %atomicrmw.phi
  //
  // atomicrmw.global:
  //   %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
  //   %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
  //                                   float %val ordering
  //   br label %atomicrmw.phi
  //
  // atomicrmw.phi:
  //   %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
  //                           [ %loaded.private, %atomicrmw.private ],
  //                           [ %loaded.global, %atomicrmw.global ]
  //   br label %atomicrmw.end
  //
  // atomicrmw.end:
  //    [...]

  IRBuilder<> Builder(AI);
  LLVMContext &Ctx = Builder.getContext();

  // If the return value isn't used, do not introduce a false use in the phi.
  bool ReturnValueIsUsed = !AI->use_empty();

  BasicBlock *BB = Builder.GetInsertBlock();
  Function *F = BB->getParent();
  BasicBlock *ExitBB =
      BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
  BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
  BasicBlock *CheckPrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
  BasicBlock *PrivateBB =
      BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
  BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
  BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);

  Value *Val = AI->getValOperand();
  Type *ValTy = Val->getType();
  Value *Addr = AI->getPointerOperand();
  Align Alignment = AI->getAlign();

  // Replace the unconditional branch created by splitBasicBlock with the
  // is.shared dispatch.
  std::prev(BB->end())->eraseFromParent();
  Builder.SetInsertPoint(BB);
  CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
                                               {Addr}, nullptr, "is.shared");
  Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);

  Builder.SetInsertPoint(SharedBB);
  Value *CastToLocal = Builder.CreateAddrSpaceCast(

  // Clone the original atomic as the LDS version, retargeted at the casted
  // pointer.
  Instruction *Clone = AI->clone();
  Clone->insertInto(SharedBB, SharedBB->end());
      .set(CastToLocal);
  Instruction *LoadedShared = Clone;

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(CheckPrivateBB);
  CallInst *IsPrivate = Builder.CreateIntrinsic(
      Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
  Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);

  // Private memory is single-threaded per lane: a plain load/op/store is
  // sufficient.
  Builder.SetInsertPoint(PrivateBB);
  Value *CastToPrivate = Builder.CreateAddrSpaceCast(
  Value *LoadedPrivate = Builder.CreateAlignedLoad(ValTy, CastToPrivate,
                                                   Alignment, "loaded.private");

  Value *NewVal = buildAtomicRMWValue(Op, Builder, LoadedPrivate, Val);

  Builder.CreateAlignedStore(NewVal, CastToPrivate, Alignment);
  Builder.CreateBr(PhiBB);

  // Reuse the original atomicrmw instruction as the global-memory version.
  Builder.SetInsertPoint(GlobalBB);
  Value *CastToGlobal = Builder.CreateAddrSpaceCast(
  Value *LoadedGlobal = AI;


  AI->removeFromParent();
  AI->insertInto(GlobalBB, GlobalBB->end());

  Builder.CreateBr(PhiBB);

  Builder.SetInsertPoint(PhiBB);

  if (ReturnValueIsUsed) {
    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
    AI->replaceAllUsesWith(Loaded);
    Loaded->addIncoming(LoadedShared, SharedBB);
    Loaded->addIncoming(LoadedPrivate, PrivateBB);
    Loaded->addIncoming(LoadedGlobal, GlobalBB);
    Loaded->takeName(AI);
  }

  Builder.CreateBr(ExitBB);
}
16789
// Replace an idempotent atomicrmw (one that does not change the stored value)
// with an atomic load of the same ordering, returning the new load, or
// nullptr if the rewrite is not legal here.
// NOTE(review): the line completing this signature is missing from this
// capture.
LoadInst *
  IRBuilder<> Builder(AI);
  auto Order = AI->getOrdering();

  // The optimization removes store aspect of the atomicrmw. Therefore, cache
  // must be flushed if the atomic ordering had a release semantics. This is
  // not necessary a fence, a release fence just coincides to do that flush.
  // Avoid replacing of an atomicrmw with a release semantics.
  if (isReleaseOrStronger(Order))
    return nullptr;

  LoadInst *LI = Builder.CreateAlignedLoad(
      AI->getType(), AI->getPointerOperand(), AI->getAlign());
  LI->setAtomic(Order, AI->getSyncScopeID());
  // Preserve metadata, name, and all uses, then drop the original atomic.
  LI->copyMetadata(*AI);
  LI->takeName(AI);
  AI->replaceAllUsesWith(LI);
  AI->eraseFromParent();
  return LI;
}
static bool isMul(MachineInstr *MI)
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
unsigned Intr
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static const LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
RelocType Type
Definition COFFYAML.cpp:391
#define LLVM_ATTRIBUTE_UNUSED
Definition Compiler.h:270
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
#define LLVM_DEBUG(X)
Definition Debug.h:101
uint64_t Align
uint64_t Addr
uint64_t Size
bool End
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool isSigned(unsigned int Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
static const unsigned MaxDepth
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
Contains matchers for matching SSA Machine Instructions.
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
unsigned Reg
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
const MachineOperand & RHS
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1196
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1193
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:469
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:166
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getWavefrontSizeLog2() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1032
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5337
bool isNegative() const
Definition APFloat.h:1360
APInt bitcastToAPInt() const
Definition APFloat.h:1266
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1050
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1010
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:994
bool isInfinity() const
Definition APFloat.h:1357
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1385
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1379
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:251
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:459
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1611
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:289
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1230
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1214
This class represents an incoming formal argument to a Function.
Definition Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
Definition ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a function, its return value, and its parameters.
Definition Attributes.h:468
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:61
iterator end()
Definition BasicBlock.h:461
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:219
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
BitVector & set()
Definition BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:673
@ ICMP_NE
not equal
Definition InstrTypes.h:695
bool isSigned() const
Definition InstrTypes.h:923
bool isFPPredicate() const
Definition InstrTypes.h:780
bool isIntPredicate() const
Definition InstrTypes.h:781
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:83
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:208
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:42
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:196
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition DataLayout.h:459
A debug info location.
Definition DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:216
iterator_range< arg_iterator > args()
Definition Function.h:889
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:773
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:277
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:380
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:814
bool hasPrefetch() const
bool hasMemoryAtomicFaddF32DenormalSupport() const
bool hasD16Images() const
bool useVGPRIndexMode() const
bool hasAtomicDsPkAdd16Insts() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasAtomicFMinFMaxF64FlatInsts() const
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasAtomicFMinFMaxF32FlatInsts() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
bool hasBCNT(unsigned Size) const
bool hasMAIInsts() const
bool supportsAgentScopeFineGrainedRemoteMemoryAtomics() const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasMadF16() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
bool useDS128() const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasAtomicFMinFMaxF32GlobalInsts() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasIntClamp() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasFFBL() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasAtomicFMinFMaxF64GlobalInsts() const
bool hasMed3_16() const
bool hasMovrel() const
bool hasAtomicFlatPkAdd16Insts() const
bool hasBFI() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasFFBH() const
bool hasAtomicFaddInsts() const
bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const
bool hasAtomicBufferPkAddBF16Inst() const
bool hasAtomicFaddNoRtnInsts() const
bool hasFlatBufferGlobalAtomicFaddF64Inst() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasAtomicBufferGlobalPkAddF16Insts() const
bool hasScalarAddSub64() const
bool hasIEEEMinMax3() const
bool hasUnpackedD16VMem() const
bool hasAtomicGlobalPkAddBF16Inst() const
bool hasAddr64() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasFractBug() const
bool hasGDS() const
bool hasBFE() const
bool hasPrivateSegmentBuffer() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2692
Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:33
Metadata node.
Definition Metadata.h:1069
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor blocks which refer to FromMBB to refer to this.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:195
Root of the metadata hierarchy.
Definition Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:291
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
SDNode * getGluedNode() const
If this node has a glue operand, return the node to which the glue operand points.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with another operation to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand flags to them.
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of the specified type.
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fit into the return registers.
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to use 'custom' lowering, and whose defined values are all legal.
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with this index.
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (touches memory).
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the address.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representation.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending or truncating it.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getRegister(Register Reg, EVT VT)
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_t size() const
Definition SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:850
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:51
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:262
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:149
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:143
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:274
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:386
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:251
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:145
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:342
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:242
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:224
const fltSemantics & getFltSemantics() const
Definition Type.cpp:71
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
A Use represents the edge between a Value definition and its users.
Definition Use.h:43
void set(Value *Val)
Definition Value.h:882
const Use & getOperandUse(unsigned i) const
Definition User.h:241
Value * getOperand(unsigned i) const
Definition User.h:228
LLVM Value Representation.
Definition Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:534
bool use_empty() const
Definition Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.cpp:1075
iterator_range< use_iterator > uses()
Definition Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:383
Type * getElementType() const
constexpr bool isZero() const
Definition TypeSize.h:156
const ParentTy * getParent() const
Definition ilist_node.h:32
self_iterator getIterator()
Definition ilist_node.h:132
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:779
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:243
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:752
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:44
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:743
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:501
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:246
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:205
@ GlobalAddress
Definition ISDOpcodes.h:78
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:840
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:557
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:716
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:236
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ CONVERGENCECTRL_GLUE
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:804
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:634
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:751
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:514
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:521
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:756
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:218
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:229
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:215
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:930
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:734
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:614
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:587
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:810
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:771
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:338
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:848
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:696
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:310
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:479
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:886
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:708
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:190
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:52
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:919
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:816
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:793
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:347
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:198
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:529
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:57
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
Definition MathExtras.h:54
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:329
@ Offset
Definition DWP.cpp:480
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:244
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:169
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:649
@ Done
Definition Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2099
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:555
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:394
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:285
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1730
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:340
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:291
T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:81
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:193
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:159
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:548
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:403
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:189
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:155
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:235
ArrayRef(const T &OneElt) -> ArrayRef< T >
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1750
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1887
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:212
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:860
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:254
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition APFloat.cpp:279
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:389
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:294
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:367
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:237
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:113
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:379
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:464
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:406
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:310
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:317
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:250
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:322
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:330
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:62
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:70
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:333
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:237
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
void setNoUnsignedWrap(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs